Libraries
library(leaps)
library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library(tree)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
library(MASS)
Matrix column entries (attributes): name - ASCII subject name and recording number MDVP:Fo(Hz) - Average vocal fundamental frequency MDVP:Fhi(Hz) - Maximum vocal fundamental frequency MDVP:Flo(Hz) - Minimum vocal fundamental frequency MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude NHR,HNR - Two measures of ratio of noise to tonal components in the voice status - Health status of the subject (one) - Parkinson’s, (zero) - healthy RPDE,D2 - Two nonlinear dynamical complexity measures DFA - Signal fractal scaling exponent spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation
pd = read.csv("/Users/piachouaifaty/parkinson.csv")
length(which(pd$status==0))
## [1] 48
48 measurements for healthy patients
length(which(pd$status==1))
## [1] 147
147 measurements for PD patients
pdavg = pd
Removing the recording number from the names to be able to match and average them
#substitutes the characters after the last underscore with a "", so removes them
pdavg$name = sub("_[^_]+$", "", pdavg$name)
head(pdavg)
## name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 1 phon_R01_S01 119.992 157.302 74.997 0.00784
## 2 phon_R01_S01 122.400 148.650 113.819 0.00968
## 3 phon_R01_S01 116.682 131.111 111.555 0.01050
## 4 phon_R01_S01 116.676 137.871 111.366 0.00997
## 5 phon_R01_S01 116.014 141.781 110.655 0.01284
## 6 phon_R01_S01 120.552 131.162 113.787 0.00968
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 1 0.00007 0.00370 0.00554 0.01109 0.04374 0.426
## 2 0.00008 0.00465 0.00696 0.01394 0.06134 0.626
## 3 0.00009 0.00544 0.00781 0.01633 0.05233 0.482
## 4 0.00009 0.00502 0.00698 0.01505 0.05492 0.517
## 5 0.00011 0.00655 0.00908 0.01966 0.06425 0.584
## 6 0.00008 0.00463 0.00750 0.01388 0.04701 0.456
## Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR status RPDE
## 1 0.02182 0.03130 0.02971 0.06545 0.02211 21.033 1 0.414783
## 2 0.03134 0.04518 0.04368 0.09403 0.01929 19.085 1 0.458359
## 3 0.02757 0.03858 0.03590 0.08270 0.01309 20.651 1 0.429895
## 4 0.02924 0.04005 0.03772 0.08771 0.01353 20.644 1 0.434969
## 5 0.03490 0.04825 0.04465 0.10470 0.01767 19.649 1 0.417356
## 6 0.02328 0.03526 0.03243 0.06985 0.01222 21.378 1 0.415564
## DFA spread1 spread2 D2 PPE
## 1 0.815285 -4.813031 0.266482 2.301442 0.284654
## 2 0.819521 -4.075192 0.335590 2.486855 0.368674
## 3 0.825288 -4.443179 0.311173 2.342259 0.332634
## 4 0.819235 -4.117501 0.334147 2.405554 0.368975
## 5 0.823484 -3.747787 0.234513 2.332180 0.410335
## 6 0.825069 -4.242867 0.299111 2.187560 0.357775
Averaging all the rows using aggregate by name (after removing the recording number, we are left with the individual ids)
pdavg=aggregate(pdavg[,2:24], list(pdavg$name), mean)
head(pdavg)
## Group.1 MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 1 phon_R01_S01 118.71933 141.3128 106.02983 0.010085000
## 2 phon_R01_S02 99.77033 121.8943 95.41317 0.004585000
## 3 phon_R01_S04 147.34617 216.8675 87.53233 0.004346667
## 4 phon_R01_S05 159.83767 181.6302 86.76717 0.006246667
## 5 phon_R01_S06 150.64467 208.2643 78.27833 0.005230000
## 6 phon_R01_S07 200.26683 210.8843 194.36617 0.002163333
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer
## 1 8.666667e-05 0.004998333 0.007311667 0.014991667 0.05393167
## 2 5.000000e-05 0.002325000 0.002856667 0.006978333 0.02166833
## 3 3.000000e-05 0.001760000 0.002320000 0.005285000 0.01934333
## 4 4.000000e-05 0.003061667 0.003421667 0.009188333 0.04333667
## 5 3.666667e-05 0.002725000 0.002838333 0.008173333 0.02136667
## 6 9.666667e-06 0.001175000 0.001281667 0.003523333 0.01080333
## MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## 1 0.51516667 0.028025000 0.03977000 0.03734833 0.08407333 0.016318333
## 2 0.19433333 0.011041667 0.01311333 0.01812333 0.03312500 0.008916667
## 3 0.18166667 0.009383333 0.01100333 0.01841000 0.02814833 0.013080000
## 4 0.38816667 0.020491667 0.02655000 0.04445833 0.06147667 0.025608333
## 5 0.20966667 0.009673333 0.01262500 0.02072500 0.02902333 0.014891667
## 6 0.09566667 0.005383333 0.00687000 0.00819500 0.01614833 0.001495000
## HNR status RPDE DFA spread1 spread2 D2 PPE
## 1 20.40667 1 0.4284877 0.8213137 -4.239926 0.2968360 2.342642 0.35384117
## 2 22.99733 1 0.5984292 0.7780167 -5.420414 0.3082893 2.287428 0.23401917
## 3 23.89967 1 0.5216600 0.6458433 -5.337281 0.2492883 2.360638 0.23200700
## 4 19.05867 1 0.6267230 0.6958855 -4.560947 0.2784825 2.787870 0.31065783
## 5 24.76200 1 0.4327235 0.7196750 -6.223537 0.2282963 2.440360 0.16493883
## 6 30.99217 0 0.3955782 0.7414823 -7.589537 0.1730488 1.795701 0.06811333
length(which(pdavg$status==0))
## [1] 8
length(which(pdavg$status==1))
## [1] 24
8 healthy patients and 24 patients with PD
pd$status=as.factor(pd$status)
summary(pd)
## name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## Length:195 Min. : 88.33 Min. :102.1 Min. : 65.48
## Class :character 1st Qu.:117.57 1st Qu.:134.9 1st Qu.: 84.29
## Mode :character Median :148.79 Median :175.8 Median :104.31
## Mean :154.23 Mean :197.1 Mean :116.32
## 3rd Qu.:182.77 3rd Qu.:224.2 3rd Qu.:140.02
## Max. :260.11 Max. :592.0 Max. :239.17
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ
## Min. :0.001680 Min. :7.000e-06 Min. :0.000680 Min. :0.000920
## 1st Qu.:0.003460 1st Qu.:2.000e-05 1st Qu.:0.001660 1st Qu.:0.001860
## Median :0.004940 Median :3.000e-05 Median :0.002500 Median :0.002690
## Mean :0.006220 Mean :4.396e-05 Mean :0.003306 Mean :0.003446
## 3rd Qu.:0.007365 3rd Qu.:6.000e-05 3rd Qu.:0.003835 3rd Qu.:0.003955
## Max. :0.033160 Max. :2.600e-04 Max. :0.021440 Max. :0.019580
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## Min. :0.002040 Min. :0.00954 Min. :0.0850 Min. :0.004550
## 1st Qu.:0.004985 1st Qu.:0.01650 1st Qu.:0.1485 1st Qu.:0.008245
## Median :0.007490 Median :0.02297 Median :0.2210 Median :0.012790
## Mean :0.009920 Mean :0.02971 Mean :0.2823 Mean :0.015664
## 3rd Qu.:0.011505 3rd Qu.:0.03789 3rd Qu.:0.3500 3rd Qu.:0.020265
## Max. :0.064330 Max. :0.11908 Max. :1.3020 Max. :0.056470
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## Min. :0.00570 Min. :0.00719 Min. :0.01364 Min. :0.000650
## 1st Qu.:0.00958 1st Qu.:0.01308 1st Qu.:0.02474 1st Qu.:0.005925
## Median :0.01347 Median :0.01826 Median :0.03836 Median :0.011660
## Mean :0.01788 Mean :0.02408 Mean :0.04699 Mean :0.024847
## 3rd Qu.:0.02238 3rd Qu.:0.02940 3rd Qu.:0.06080 3rd Qu.:0.025640
## Max. :0.07940 Max. :0.13778 Max. :0.16942 Max. :0.314820
## HNR status RPDE DFA spread1
## Min. : 8.441 0: 48 Min. :0.2566 Min. :0.5743 Min. :-7.965
## 1st Qu.:19.198 1:147 1st Qu.:0.4213 1st Qu.:0.6748 1st Qu.:-6.450
## Median :22.085 Median :0.4960 Median :0.7223 Median :-5.721
## Mean :21.886 Mean :0.4985 Mean :0.7181 Mean :-5.684
## 3rd Qu.:25.076 3rd Qu.:0.5876 3rd Qu.:0.7619 3rd Qu.:-5.046
## Max. :33.047 Max. :0.6852 Max. :0.8253 Max. :-2.434
## spread2 D2 PPE
## Min. :0.006274 Min. :1.423 Min. :0.04454
## 1st Qu.:0.174350 1st Qu.:2.099 1st Qu.:0.13745
## Median :0.218885 Median :2.362 Median :0.19405
## Mean :0.226510 Mean :2.382 Mean :0.20655
## 3rd Qu.:0.279234 3rd Qu.:2.636 3rd Qu.:0.25298
## Max. :0.450493 Max. :3.671 Max. :0.52737
pairs(pd[,-1], col=pd$status)
#red=parkinson's
plot(pd$PPE, pd$spread1, col=pd$status)
plot(pd$PPE, pd$spread1, col=pd$status)
pd2=pd[,-1]
rownames(pd2) = pd[,1]
pd2=pd2[,-17]
par(mfrow=c(3,3))
for (i in colnames(pd2))
{hist(pd2[,i], main=i)}
par(mfrow=c(1,3))
for (i in colnames(pd2))
{boxplot(pd2[,i], main=i)}
outl=boxplot.stats(pd2[,"MDVP.Flo.Hz."])$out
ind=which(pd2[,"MDVP.Flo.Hz."] %in% c(outl))
pd2[c(ind),]
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## phon_R01_S10_1 237.226 247.326 225.227 0.00298
## phon_R01_S10_2 241.404 248.834 232.483 0.00281
## phon_R01_S10_3 243.439 250.912 232.435 0.00210
## phon_R01_S10_4 242.852 255.034 227.911 0.00225
## phon_R01_S10_5 245.510 262.090 231.848 0.00235
## phon_R01_S17_4 228.832 234.619 223.634 0.00296
## phon_R01_S42_2 237.323 243.709 229.256 0.00303
## phon_R01_S42_3 260.105 264.919 237.303 0.00339
## phon_R01_S42_6 244.990 272.210 239.170 0.00451
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer
## phon_R01_S10_1 1e-05 0.00169 0.00182 0.00507 0.01752
## phon_R01_S10_2 1e-05 0.00157 0.00173 0.00470 0.01760
## phon_R01_S10_3 9e-06 0.00109 0.00137 0.00327 0.01419
## phon_R01_S10_4 9e-06 0.00117 0.00139 0.00350 0.01494
## phon_R01_S10_5 1e-05 0.00127 0.00148 0.00380 0.01608
## phon_R01_S17_4 1e-05 0.00175 0.00155 0.00526 0.01644
## phon_R01_S42_2 1e-05 0.00173 0.00159 0.00519 0.01242
## phon_R01_S42_3 1e-05 0.00205 0.00186 0.00616 0.02030
## phon_R01_S42_6 2e-05 0.00279 0.00237 0.00837 0.01897
## MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA
## phon_R01_S10_1 0.164 0.01035 0.01024 0.01133 0.03104
## phon_R01_S10_2 0.154 0.01006 0.01038 0.01251 0.03017
## phon_R01_S10_3 0.126 0.00777 0.00898 0.01033 0.02330
## phon_R01_S10_4 0.134 0.00847 0.00879 0.01014 0.02542
## phon_R01_S10_5 0.141 0.00906 0.00977 0.01149 0.02719
## phon_R01_S17_4 0.145 0.00882 0.01075 0.01179 0.02647
## phon_R01_S42_2 0.116 0.00696 0.00747 0.00882 0.02089
## phon_R01_S42_3 0.197 0.01186 0.01230 0.01367 0.03557
## phon_R01_S42_6 0.181 0.01084 0.01121 0.01255 0.03253
## NHR HNR RPDE DFA spread1 spread2 D2
## phon_R01_S10_1 0.00740 22.736 0.305062 0.654172 -7.310550 0.098648 2.416838
## phon_R01_S10_2 0.00675 23.145 0.457702 0.634267 -6.793547 0.158266 2.256699
## phon_R01_S10_3 0.00454 25.368 0.438296 0.635285 -7.057869 0.091608 2.330716
## phon_R01_S10_4 0.00476 25.032 0.431285 0.638928 -6.995820 0.102083 2.365800
## phon_R01_S10_5 0.00476 24.602 0.467489 0.631653 -7.156076 0.127642 2.392122
## phon_R01_S17_4 0.00351 25.964 0.256570 0.683296 -7.245620 0.018689 2.498224
## phon_R01_S42_2 0.00533 24.679 0.384868 0.626710 -7.018057 0.176316 1.852402
## phon_R01_S42_3 0.00910 21.083 0.440988 0.628058 -7.517934 0.160414 1.881767
## phon_R01_S42_6 0.01049 21.528 0.522812 0.646818 -7.304500 0.171088 2.095237
## PPE
## phon_R01_S10_1 0.095032
## phon_R01_S10_2 0.117399
## phon_R01_S10_3 0.091470
## phon_R01_S10_4 0.102706
## phon_R01_S10_5 0.097336
## phon_R01_S17_4 0.093534
## phon_R01_S42_2 0.091604
## phon_R01_S42_3 0.075587
## phon_R01_S42_6 0.096220
pd[c(ind),]
## name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 43 phon_R01_S10_1 237.226 247.326 225.227 0.00298
## 44 phon_R01_S10_2 241.404 248.834 232.483 0.00281
## 45 phon_R01_S10_3 243.439 250.912 232.435 0.00210
## 46 phon_R01_S10_4 242.852 255.034 227.911 0.00225
## 47 phon_R01_S10_5 245.510 262.090 231.848 0.00235
## 64 phon_R01_S17_4 228.832 234.619 223.634 0.00296
## 167 phon_R01_S42_2 237.323 243.709 229.256 0.00303
## 168 phon_R01_S42_3 260.105 264.919 237.303 0.00339
## 171 phon_R01_S42_6 244.990 272.210 239.170 0.00451
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 43 1e-05 0.00169 0.00182 0.00507 0.01752 0.164
## 44 1e-05 0.00157 0.00173 0.00470 0.01760 0.154
## 45 9e-06 0.00109 0.00137 0.00327 0.01419 0.126
## 46 9e-06 0.00117 0.00139 0.00350 0.01494 0.134
## 47 1e-05 0.00127 0.00148 0.00380 0.01608 0.141
## 64 1e-05 0.00175 0.00155 0.00526 0.01644 0.145
## 167 1e-05 0.00173 0.00159 0.00519 0.01242 0.116
## 168 1e-05 0.00205 0.00186 0.00616 0.02030 0.197
## 171 2e-05 0.00279 0.00237 0.00837 0.01897 0.181
## Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR status
## 43 0.01035 0.01024 0.01133 0.03104 0.00740 22.736 0
## 44 0.01006 0.01038 0.01251 0.03017 0.00675 23.145 0
## 45 0.00777 0.00898 0.01033 0.02330 0.00454 25.368 0
## 46 0.00847 0.00879 0.01014 0.02542 0.00476 25.032 0
## 47 0.00906 0.00977 0.01149 0.02719 0.00476 24.602 0
## 64 0.00882 0.01075 0.01179 0.02647 0.00351 25.964 0
## 167 0.00696 0.00747 0.00882 0.02089 0.00533 24.679 0
## 168 0.01186 0.01230 0.01367 0.03557 0.00910 21.083 0
## 171 0.01084 0.01121 0.01255 0.03253 0.01049 21.528 0
## RPDE DFA spread1 spread2 D2 PPE
## 43 0.305062 0.654172 -7.310550 0.098648 2.416838 0.095032
## 44 0.457702 0.634267 -6.793547 0.158266 2.256699 0.117399
## 45 0.438296 0.635285 -7.057869 0.091608 2.330716 0.091470
## 46 0.431285 0.638928 -6.995820 0.102083 2.365800 0.102706
## 47 0.467489 0.631653 -7.156076 0.127642 2.392122 0.097336
## 64 0.256570 0.683296 -7.245620 0.018689 2.498224 0.093534
## 167 0.384868 0.626710 -7.018057 0.176316 1.852402 0.091604
## 168 0.440988 0.628058 -7.517934 0.160414 1.881767 0.075587
## 171 0.522812 0.646818 -7.304500 0.171088 2.095237 0.096220
I check some of the outliers in the MDVP.Flo.Hz. column.
5 of them come from the same individual, and the number of “outliers” is very high. Also, the data is skewed in general, so I don’t remove any of the outliers. Since the majority of measurements are diseased, and the outliers in one of the columns are all control, I think all the “outliers” are actually just control measurements.
I check a few more.
outl=boxplot.stats(pd2[,"MDVP.Fhi.Hz."])$out
ind=which(pd2[,"MDVP.Fhi.Hz."] %in% c(outl))
pd[c(ind),]
## name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 74 phon_R01_S19_2 112.014 588.518 107.024 0.00533
## 103 phon_R01_S24_6 139.224 586.567 66.157 0.03011
## 116 phon_R01_S27_1 151.872 492.892 69.085 0.00856
## 117 phon_R01_S27_2 158.219 442.557 71.948 0.00476
## 118 phon_R01_S27_3 170.756 450.247 79.032 0.00555
## 119 phon_R01_S27_4 178.285 442.824 82.063 0.00462
## 121 phon_R01_S27_6 128.940 479.697 88.251 0.00581
## 150 phon_R01_S35_4 202.632 565.740 177.258 0.01627
## 187 phon_R01_S49_4 116.556 592.030 86.228 0.00496
## 188 phon_R01_S49_5 116.342 581.289 94.246 0.00267
## 194 phon_R01_S50_5 198.764 396.961 74.904 0.00740
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 74 0.00005 0.00268 0.00329 0.00805 0.02448 0.226
## 103 0.00022 0.01854 0.01628 0.05563 0.09419 0.930
## 116 0.00006 0.00404 0.00385 0.01211 0.01843 0.235
## 117 0.00003 0.00214 0.00207 0.00642 0.01458 0.148
## 118 0.00003 0.00244 0.00261 0.00731 0.01725 0.175
## 119 0.00003 0.00157 0.00194 0.00472 0.01279 0.129
## 121 0.00005 0.00241 0.00314 0.00723 0.02008 0.221
## 150 0.00008 0.00919 0.00963 0.02756 0.07170 0.833
## 187 0.00004 0.00254 0.00263 0.00762 0.01660 0.154
## 188 0.00002 0.00115 0.00148 0.00345 0.01300 0.117
## 194 0.00004 0.00370 0.00390 0.01109 0.02296 0.241
## Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR status
## 74 0.01373 0.01375 0.01956 0.04120 0.00623 24.178 1
## 103 0.05551 0.05005 0.06023 0.16654 0.25930 10.489 1
## 116 0.00796 0.00832 0.01271 0.02389 0.06051 23.693 1
## 117 0.00606 0.00747 0.01312 0.01818 0.01554 26.356 1
## 118 0.00757 0.00971 0.01652 0.02270 0.01802 25.690 1
## 119 0.00617 0.00744 0.01151 0.01851 0.00856 25.020 1
## 121 0.00849 0.01117 0.01734 0.02548 0.02350 24.743 1
## 150 0.03515 0.04265 0.06460 0.10546 0.07889 14.989 1
## 187 0.00820 0.00972 0.01491 0.02460 0.01397 23.958 0
## 188 0.00631 0.00789 0.01144 0.01892 0.00680 25.023 0
## 194 0.01265 0.01321 0.01588 0.03794 0.07223 19.020 0
## RPDE DFA spread1 spread2 D2 PPE
## 74 0.509127 0.789532 -5.389129 0.306636 1.928708 0.225461
## 103 0.596362 0.641418 -3.269487 0.270641 2.690917 0.444774
## 116 0.407701 0.662668 -4.673241 0.261549 2.702355 0.274407
## 117 0.450798 0.653823 -6.051233 0.273280 2.640798 0.170106
## 118 0.486738 0.676023 -4.597834 0.372114 2.975889 0.282780
## 119 0.470422 0.655239 -4.913137 0.393056 2.816781 0.251972
## 121 0.487756 0.684130 -6.186128 0.279933 2.686240 0.152428
## 150 0.427627 0.775708 -4.892495 0.262281 2.910213 0.270173
## 187 0.566424 0.667654 -6.431119 0.153310 2.161936 0.120605
## 188 0.528485 0.663884 -6.359018 0.116636 2.152083 0.138868
## 194 0.451221 0.643956 -6.744577 0.207454 2.138608 0.123306
In this case many of the outlier measurements come from the same individual.
outl=boxplot.stats(pd2[,"MDVP.Jitter..."])$out
ind=which(pd2[,"MDVP.Jitter..."] %in% c(outl))
pd[c(ind),]
## name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 99 phon_R01_S24_2 125.791 140.557 96.206 0.01378
## 100 phon_R01_S24_3 126.512 141.756 99.770 0.01936
## 101 phon_R01_S24_4 125.641 141.068 116.346 0.03316
## 102 phon_R01_S24_5 128.451 150.449 75.632 0.01551
## 103 phon_R01_S24_6 139.224 586.567 66.157 0.03011
## 147 phon_R01_S35_1 169.774 191.759 151.451 0.01568
## 148 phon_R01_S35_2 183.520 216.814 161.340 0.01466
## 149 phon_R01_S35_3 188.620 216.302 165.982 0.01719
## 150 phon_R01_S35_4 202.632 565.740 177.258 0.01627
## 151 phon_R01_S35_5 186.695 211.961 149.442 0.01872
## 152 phon_R01_S35_6 192.818 224.429 168.793 0.03107
## 153 phon_R01_S35_7 198.116 233.099 174.478 0.02714
## 158 phon_R01_S37_5 117.963 134.209 100.757 0.01813
## 193 phon_R01_S50_4 174.688 240.005 74.287 0.01360
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 99 0.00011 0.00826 0.00655 0.02478 0.04689 0.422
## 100 0.00015 0.01159 0.00990 0.03476 0.06734 0.659
## 101 0.00026 0.02144 0.01522 0.06433 0.09178 0.891
## 102 0.00012 0.00905 0.00909 0.02716 0.06170 0.584
## 103 0.00022 0.01854 0.01628 0.05563 0.09419 0.930
## 147 0.00009 0.00863 0.00946 0.02589 0.08143 0.821
## 148 0.00008 0.00849 0.00819 0.02546 0.06050 0.618
## 149 0.00009 0.00996 0.01027 0.02987 0.07118 0.722
## 150 0.00008 0.00919 0.00963 0.02756 0.07170 0.833
## 151 0.00010 0.01075 0.01154 0.03225 0.05830 0.784
## 152 0.00016 0.01800 0.01958 0.05401 0.11908 1.302
## 153 0.00014 0.01568 0.01699 0.04705 0.08684 1.018
## 158 0.00015 0.01117 0.00718 0.03351 0.04912 0.438
## 193 0.00008 0.00624 0.00564 0.01873 0.02308 0.256
## Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR status
## 99 0.02542 0.02630 0.03908 0.07625 0.10323 15.433 1
## 100 0.03611 0.03963 0.05783 0.10833 0.16744 12.435 1
## 101 0.05358 0.04791 0.06196 0.16074 0.31482 8.867 1
## 102 0.03223 0.03672 0.05174 0.09669 0.11843 15.060 1
## 103 0.05551 0.05005 0.06023 0.16654 0.25930 10.489 1
## 147 0.03804 0.05426 0.08808 0.11411 0.07530 12.359 1
## 148 0.02865 0.04101 0.06359 0.08595 0.06057 14.367 1
## 149 0.03474 0.04580 0.06824 0.10422 0.08069 12.298 1
## 150 0.03515 0.04265 0.06460 0.10546 0.07889 14.989 1
## 151 0.02699 0.03714 0.06259 0.08096 0.10952 12.529 1
## 152 0.05647 0.07940 0.13778 0.16942 0.21713 8.441 1
## 153 0.04284 0.05556 0.08318 0.12851 0.16265 9.449 1
## 158 0.02610 0.02161 0.02916 0.07830 0.10748 19.075 1
## 193 0.01268 0.01365 0.01667 0.03804 0.10715 17.883 0
## RPDE DFA spread1 spread2 D2 PPE
## 99 0.571010 0.690892 -5.159169 0.202146 2.441612 0.260375
## 100 0.638545 0.674953 -3.760348 0.242861 2.634633 0.378483
## 101 0.671299 0.656846 -3.700544 0.260481 2.991063 0.370961
## 102 0.639808 0.643327 -4.202730 0.310163 2.638279 0.356881
## 103 0.596362 0.641418 -3.269487 0.270641 2.690917 0.444774
## 147 0.561610 0.793509 -3.297668 0.414758 3.413649 0.457533
## 148 0.478024 0.768974 -4.276605 0.355736 3.142364 0.336085
## 149 0.552870 0.764036 -3.377325 0.335357 3.274865 0.418646
## 150 0.427627 0.775708 -4.892495 0.262281 2.910213 0.270173
## 151 0.507826 0.762726 -4.484303 0.340256 2.958815 0.301487
## 152 0.625866 0.768320 -2.434031 0.450493 3.079221 0.527367
## 153 0.584164 0.754449 -2.839756 0.356224 3.184027 0.454721
## 158 0.630547 0.646786 -3.444478 0.303214 2.964568 0.261305
## 193 0.407567 0.655683 -6.787197 0.158453 2.679772 0.131728
In this case, outliers come from the same individual.
outl=boxplot.stats(pd2[,"MDVP.RAP"])$out
ind=which(pd2[,"MDVP.RAP"] %in% c(outl))
pd[c(ind),]
## name MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 98 phon_R01_S24_1 125.036 143.946 116.187 0.01280
## 99 phon_R01_S24_2 125.791 140.557 96.206 0.01378
## 100 phon_R01_S24_3 126.512 141.756 99.770 0.01936
## 101 phon_R01_S24_4 125.641 141.068 116.346 0.03316
## 102 phon_R01_S24_5 128.451 150.449 75.632 0.01551
## 103 phon_R01_S24_6 139.224 586.567 66.157 0.03011
## 147 phon_R01_S35_1 169.774 191.759 151.451 0.01568
## 148 phon_R01_S35_2 183.520 216.814 161.340 0.01466
## 149 phon_R01_S35_3 188.620 216.302 165.982 0.01719
## 150 phon_R01_S35_4 202.632 565.740 177.258 0.01627
## 151 phon_R01_S35_5 186.695 211.961 149.442 0.01872
## 152 phon_R01_S35_6 192.818 224.429 168.793 0.03107
## 153 phon_R01_S35_7 198.116 233.099 174.478 0.02714
## 158 phon_R01_S37_5 117.963 134.209 100.757 0.01813
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 98 0.00010 0.00743 0.00623 0.02228 0.03886 0.342
## 99 0.00011 0.00826 0.00655 0.02478 0.04689 0.422
## 100 0.00015 0.01159 0.00990 0.03476 0.06734 0.659
## 101 0.00026 0.02144 0.01522 0.06433 0.09178 0.891
## 102 0.00012 0.00905 0.00909 0.02716 0.06170 0.584
## 103 0.00022 0.01854 0.01628 0.05563 0.09419 0.930
## 147 0.00009 0.00863 0.00946 0.02589 0.08143 0.821
## 148 0.00008 0.00849 0.00819 0.02546 0.06050 0.618
## 149 0.00009 0.00996 0.01027 0.02987 0.07118 0.722
## 150 0.00008 0.00919 0.00963 0.02756 0.07170 0.833
## 151 0.00010 0.01075 0.01154 0.03225 0.05830 0.784
## 152 0.00016 0.01800 0.01958 0.05401 0.11908 1.302
## 153 0.00014 0.01568 0.01699 0.04705 0.08684 1.018
## 158 0.00015 0.01117 0.00718 0.03351 0.04912 0.438
## Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR status
## 98 0.02135 0.02174 0.03088 0.06406 0.08151 15.338 1
## 99 0.02542 0.02630 0.03908 0.07625 0.10323 15.433 1
## 100 0.03611 0.03963 0.05783 0.10833 0.16744 12.435 1
## 101 0.05358 0.04791 0.06196 0.16074 0.31482 8.867 1
## 102 0.03223 0.03672 0.05174 0.09669 0.11843 15.060 1
## 103 0.05551 0.05005 0.06023 0.16654 0.25930 10.489 1
## 147 0.03804 0.05426 0.08808 0.11411 0.07530 12.359 1
## 148 0.02865 0.04101 0.06359 0.08595 0.06057 14.367 1
## 149 0.03474 0.04580 0.06824 0.10422 0.08069 12.298 1
## 150 0.03515 0.04265 0.06460 0.10546 0.07889 14.989 1
## 151 0.02699 0.03714 0.06259 0.08096 0.10952 12.529 1
## 152 0.05647 0.07940 0.13778 0.16942 0.21713 8.441 1
## 153 0.04284 0.05556 0.08318 0.12851 0.16265 9.449 1
## 158 0.02610 0.02161 0.02916 0.07830 0.10748 19.075 1
## RPDE DFA spread1 spread2 D2 PPE
## 98 0.629574 0.714485 -4.020042 0.265315 2.671825 0.340623
## 99 0.571010 0.690892 -5.159169 0.202146 2.441612 0.260375
## 100 0.638545 0.674953 -3.760348 0.242861 2.634633 0.378483
## 101 0.671299 0.656846 -3.700544 0.260481 2.991063 0.370961
## 102 0.639808 0.643327 -4.202730 0.310163 2.638279 0.356881
## 103 0.596362 0.641418 -3.269487 0.270641 2.690917 0.444774
## 147 0.561610 0.793509 -3.297668 0.414758 3.413649 0.457533
## 148 0.478024 0.768974 -4.276605 0.355736 3.142364 0.336085
## 149 0.552870 0.764036 -3.377325 0.335357 3.274865 0.418646
## 150 0.427627 0.775708 -4.892495 0.262281 2.910213 0.270173
## 151 0.507826 0.762726 -4.484303 0.340256 2.958815 0.301487
## 152 0.625866 0.768320 -2.434031 0.450493 3.079221 0.527367
## 153 0.584164 0.754449 -2.839756 0.356224 3.184027 0.454721
## 158 0.630547 0.646786 -3.444478 0.303214 2.964568 0.261305
Here, two individuals are “outliers.”
Dealing with outliers by either removing them or replacing them with the mean/median seems unnecessary - the data is just skewed in nature. Removing outliers would make the analysis biased.
So I decide to keep them. Since the data is not normally distributed (many predictors are skewed)
pca_pd=prcomp(pd2, scale=TRUE)
str(pca_pd)
## List of 5
## $ sdev : num [1:22] 3.6 1.577 1.242 1.21 0.987 ...
## $ rotation: num [1:22, 1:22] 0.05333 -0.00671 0.06382 -0.25451 -0.24168 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
## .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:22] 1.54e+02 1.97e+02 1.16e+02 6.22e-03 4.40e-05 ...
## ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
## $ scale : Named num [1:22] 4.14e+01 9.15e+01 4.35e+01 4.85e-03 3.48e-05 ...
## ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
## $ x : num [1:195, 1:22] -2.09 -4.7 -3.84 -4.12 -5.68 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:195] "phon_R01_S01_1" "phon_R01_S01_2" "phon_R01_S01_3" "phon_R01_S01_4" ...
## .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
#library(devtools)
#install_github("vqv/ggbiplot")
#library(ggbiplot)
ggbiplot(pca_pd)
biplot(pca_pd, scale=0)
ggbiplot(pca_pd, labels=rownames(pd2))
disease_stat=ifelse(pd$status=="1", "PD", "HLT")
ggbiplot(pca_pd,ellipse=TRUE, groups=disease_stat)
The PD measurements seem to be clustered to the bottom
head(pca_pd$rotation)
## PC1 PC2 PC3 PC4 PC5
## MDVP.Fo.Hz. 0.053331113 0.55340103 -0.1282881 0.1312285 0.1150947947
## MDVP.Fhi.Hz. -0.006712501 0.34878153 -0.2676430 -0.2409897 0.1877101044
## MDVP.Flo.Hz. 0.063819423 0.39548027 0.2328452 0.2200752 0.2858261098
## MDVP.Jitter... -0.254513242 0.08178929 0.1506745 -0.2516952 0.0711150522
## MDVP.Jitter.Abs. -0.241680694 -0.07677255 0.1852899 -0.3081169 -0.0008538928
## MDVP.RAP -0.249823572 0.11603792 0.1692132 -0.2576196 0.0258294215
## PC6 PC7 PC8 PC9 PC10
## MDVP.Fo.Hz. 0.14766861 -0.003505647 -0.216748419 -0.064192702 0.64532341
## MDVP.Fhi.Hz. -0.71903648 0.369740613 -0.054245149 0.153286499 -0.17370588
## MDVP.Flo.Hz. 0.46768146 0.455281251 0.007371196 0.079323633 -0.45949396
## MDVP.Jitter... 0.04166932 -0.021401088 0.032194922 0.012646201 0.11005180
## MDVP.Jitter.Abs. 0.03927228 0.004741437 0.088529516 -0.002100416 -0.13903402
## MDVP.RAP 0.07737102 -0.045632716 0.034029356 0.056594488 0.08351011
## PC11 PC12 PC13 PC14 PC15
## MDVP.Fo.Hz. -0.107077243 0.25618422 -0.136797666 0.04811878 0.06876995
## MDVP.Fhi.Hz. -0.006748088 -0.00293302 0.003450677 -0.02033489 0.02663274
## MDVP.Flo.Hz. -0.072647366 -0.05807287 0.025845377 -0.02589691 0.00625575
## MDVP.Jitter... 0.077534938 0.02693779 0.157434525 -0.06472545 0.05479457
## MDVP.Jitter.Abs. -0.257337393 0.15269365 0.221431558 0.31804839 0.27478004
## MDVP.RAP 0.035878080 0.11951169 0.125351563 -0.23329677 0.05777257
## PC16 PC17 PC18 PC19
## MDVP.Fo.Hz. -0.220928548 0.00253487 -0.012170838 -5.289131e-02
## MDVP.Fhi.Hz. -0.001394872 0.04206616 0.007020829 -5.063792e-05
## MDVP.Flo.Hz. 0.026583245 -0.01825606 -0.010846368 2.240354e-02
## MDVP.Jitter... 0.001179548 -0.16366562 -0.445880827 7.448806e-01
## MDVP.Jitter.Abs. -0.633146349 0.10389484 -0.034033642 -2.186430e-01
## MDVP.RAP 0.245194725 0.19327305 0.345752801 -5.354520e-02
## PC20 PC21 PC22
## MDVP.Fo.Hz. 0.0007821080 -1.748182e-04 1.700574e-05
## MDVP.Fhi.Hz. -0.0042671358 -1.990487e-05 -1.454460e-05
## MDVP.Flo.Hz. -0.0005192626 6.257515e-05 -2.930880e-05
## MDVP.Jitter... -0.0525475476 2.417675e-04 -5.619788e-05
## MDVP.Jitter.Abs. -0.0427786398 -1.893345e-04 -1.238837e-04
## MDVP.RAP -0.0146062154 7.067090e-01 -2.157490e-02
Proprotion of Variance Explained by PC1
pr.var=pca_pd$sdev^2
pve=pr.var/sum(pr.var)
par(mfrow=c(1,2))
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained ", ylim=c(0,1),type="b")
plot(cumsum(pve), xlab="Principal Component ", ylab=" Cumulative Proportion of Variance Explained ", ylim=c(0,1), type="b")
PC1 explains 58.9% of the variance.
Clustering with K=2
km.pd2=kmeans(pd2,2,nstart=20)
head(km.pd2$cluster)
## phon_R01_S01_1 phon_R01_S01_2 phon_R01_S01_3 phon_R01_S01_4 phon_R01_S01_5
## 2 2 2 2 2
## phon_R01_S01_6
## 2
head(pd$status)
## [1] 1 1 1 1 1 1
## Levels: 0 1
s=ifelse(pd$status==1, 1, 2) #so the colors match
head(s)
## [1] 1 1 1 1 1 1
par(mfrow=c(1,2))
for (i in colnames(pd2))
{plot(pd2[,i], col=(km.pd2$cluster), main="K-Means Clustering; K=2", xlab="", ylab=i, pch=20, cex=2)
plot(pd2[,i], col=(s), main="By Status; black=PD", xlab="", ylab=i, pch=20, cex=2)}
pd2=cbind(pd2, pd$status)
colnames(pd2)=c("MDVP.Fo.Hz.", "MDVP.Fhi.Hz." , "MDVP.Flo.Hz." , "MDVP.Jitter..." , "MDVP.Jitter.Abs." ,"MDVP.RAP" , "MDVP.PPQ" , "Jitter.DDP" , "MDVP.Shimmer" , "MDVP.Shimmer.dB." ,"Shimmer.APQ3" , "Shimmer.APQ5" ,
"MDVP.APQ" , "Shimmer.DDA" , "NHR" , "HNR" , "RPDE" , "DFA" ,
"spread1" , "spread2" , "D2" , "PPE" , "status")
The data is skewed, so in order to normalize it, I log transform all the values. The values for spread1 are negative, so I add a constant to all the values.
offset=min(pd2[,"spread1"])
offset=-offset
offset
## [1] 7.964984
pd2norm=log(offset+1+pd2[1:22])
pd2norm=cbind(pd2norm, pd2[,"status"])
head(pd2norm)
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## phon_R01_S01_1 4.859479 5.113595 4.430364 2.194200
## phon_R01_S01_2 4.877980 5.060155 4.810427 2.194405
## phon_R01_S01_3 4.833476 4.942185 4.791816 2.194497
## phon_R01_S01_4 4.833429 4.989316 4.790246 2.194438
## phon_R01_S01_5 4.828146 5.015596 4.784320 2.194758
## phon_R01_S01_6 4.863812 4.942549 4.810166 2.194405
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer
## phon_R01_S01_1 2.193334 2.193739 2.193944 2.194563 2.198193
## phon_R01_S01_2 2.193335 2.193845 2.194102 2.194880 2.200145
## phon_R01_S01_3 2.193336 2.193933 2.194197 2.195146 2.199147
## phon_R01_S01_4 2.193336 2.193886 2.194105 2.195004 2.199434
## phon_R01_S01_5 2.193339 2.194057 2.194339 2.195517 2.200468
## phon_R01_S01_6 2.193335 2.193843 2.194163 2.194873 2.198556
## MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA
## phon_R01_S01_1 2.239750 2.195757 2.196812 2.196635 2.200600
## phon_R01_S01_2 2.260823 2.196816 2.198353 2.198187 2.203760
## phon_R01_S01_3 2.245696 2.196397 2.197620 2.197323 2.202509
## phon_R01_S01_4 2.249394 2.196583 2.197784 2.197525 2.203062
## phon_R01_S01_5 2.256435 2.197212 2.198694 2.198294 2.204937
## phon_R01_S01_6 2.242940 2.195920 2.197252 2.196937 2.201088
## NHR HNR RPDE DFA spread1 spread2 D2
## phon_R01_S01_1 2.195790 3.401130 2.238555 2.280367 1.423579 2.222618 2.421827
## phon_R01_S01_2 2.195476 3.333988 2.243190 2.280800 1.587150 2.230076 2.438150
## phon_R01_S01_3 2.194785 3.388314 2.240165 2.281389 1.508911 2.227447 2.425443
## phon_R01_S01_4 2.194834 3.388078 2.240705 2.280771 1.578460 2.229921 2.431026
## phon_R01_S01_5 2.195295 3.353896 2.238829 2.281205 1.651960 2.219149 2.424552
## phon_R01_S01_6 2.194688 3.412565 2.238638 2.281367 1.552257 2.226146 2.411668
## PPE pd2[, "status"]
## phon_R01_S01_1 2.224584 1
## phon_R01_S01_2 2.233627 1
## phon_R01_S01_3 2.229758 1
## phon_R01_S01_4 2.233659 1
## phon_R01_S01_5 2.238081 1
## phon_R01_S01_6 2.232459 1
colnames(pd2norm)=c("MDVP.Fo.Hz.", "MDVP.Fhi.Hz." , "MDVP.Flo.Hz." , "MDVP.Jitter..." , "MDVP.Jitter.Abs." ,"MDVP.RAP" , "MDVP.PPQ" , "Jitter.DDP" , "MDVP.Shimmer" , "MDVP.Shimmer.dB." ,"Shimmer.APQ3" , "Shimmer.APQ5" ,
"MDVP.APQ" , "Shimmer.DDA" , "NHR" , "HNR" , "RPDE" , "DFA" ,
"spread1" , "spread2" , "D2" , "PPE" , "status")
Plotting Again
par(mfrow=c(3,3))
for (i in colnames(pd2norm[1:22]))
{hist(pd2norm[,i], main=i)}
par(mfrow=c(1,3))
for (i in colnames(pd2norm[1:22]))
{boxplot(pd2norm[,i], main=i)}
PCA Again
pca_pd=prcomp(pd2norm[1:22], scale=TRUE)
str(pca_pd)
## List of 5
## $ sdev : num [1:22] 3.592 1.631 1.262 1.202 0.984 ...
## $ rotation: num [1:22, 1:22] 0.048 0.0032 0.0612 -0.2549 -0.242 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
## .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:22] 5.06 5.26 4.78 2.19 2.19 ...
## ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
## $ scale : Named num [1:22] 2.47e-01 3.48e-01 3.12e-01 5.40e-04 3.88e-06 ...
## ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
## $ x : num [1:195, 1:22] -2.09 -4.63 -3.81 -4.06 -5.58 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:195] "phon_R01_S01_1" "phon_R01_S01_2" "phon_R01_S01_3" "phon_R01_S01_4" ...
## .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
#library(devtools)
#install_github("vqv/ggbiplot")
#library(ggbiplot)
ggbiplot(pca_pd)
biplot(pca_pd, scale=0)
ggbiplot(pca_pd, labels=rownames(pd2))
disease_stat=ifelse(pd$status=="1", "PD", "HLT")
ggbiplot(pca_pd,ellipse=TRUE, groups=disease_stat)
The PD measurements seem to be clustered to the bottom
#pca_pd$rotation
I don’t end up using the normalized data because it has weird output.
Since there are only 8 healthy individuals (around 84 measurements for healthy vs 147 for diseased), I make sure to include a sufficient number in the training and test sets, so I split them into healthy vs diseased, sample from each, and then combine them into training and test sets.
pd_dis_idx=which(pd2$status==1) #indeces of PD individuals
pd_health_idx=which(pd2$status==0) #indeces of healthy individuals
#pd_health_idx
#pd_dis_idx
set.seed(4706)
#nrow(pd_dis)
#nrow(pd_health)
#73 diseased test
#24 healthy test
#74 diseases train
#24 healthy train
test_health=sample(pd_health_idx, 24, replace = FALSE) #sample from indeces
#test_health
test_dis=sample(pd_dis_idx, 73, replace = FALSE) #sample from indeces
#test_dis
test=append(test_dis, test_health) #test set
tot=1:nrow(pd2)
train=tot[-test] #training set
y.train=pd2[c(train), "status"]
y.test=pd2[c(test), "status"]
y.train
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [77] 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0
## Levels: 0 1
y.test
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0
## [77] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
regfit_full=regsubsets(status~., data=pd2, subset=train, nvmax=23) #nvmax=p
reg_full_sum=summary(regfit_full) #returns R^2, adjusted R^2, AIC, BIC, Cp
reg_full_sum
## Subset selection object
## Call: regsubsets.formula(status ~ ., data = pd2, subset = train, nvmax = 23)
## 22 Variables (and intercept)
## Forced in Forced out
## MDVP.Fo.Hz. FALSE FALSE
## MDVP.Fhi.Hz. FALSE FALSE
## MDVP.Flo.Hz. FALSE FALSE
## MDVP.Jitter... FALSE FALSE
## MDVP.Jitter.Abs. FALSE FALSE
## MDVP.RAP FALSE FALSE
## MDVP.PPQ FALSE FALSE
## Jitter.DDP FALSE FALSE
## MDVP.Shimmer FALSE FALSE
## MDVP.Shimmer.dB. FALSE FALSE
## Shimmer.APQ3 FALSE FALSE
## Shimmer.APQ5 FALSE FALSE
## MDVP.APQ FALSE FALSE
## Shimmer.DDA FALSE FALSE
## NHR FALSE FALSE
## HNR FALSE FALSE
## RPDE FALSE FALSE
## DFA FALSE FALSE
## spread1 FALSE FALSE
## spread2 FALSE FALSE
## D2 FALSE FALSE
## PPE FALSE FALSE
## 1 subsets of each size up to 22
## Selection Algorithm: exhaustive
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter... MDVP.Jitter.Abs.
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " "
## 7 ( 1 ) " " " " "*" " " " "
## 8 ( 1 ) " " " " "*" " " " "
## 9 ( 1 ) "*" " " " " " " "*"
## 10 ( 1 ) "*" " " "*" " " "*"
## 11 ( 1 ) "*" " " "*" " " "*"
## 12 ( 1 ) "*" " " "*" " " "*"
## 13 ( 1 ) "*" " " "*" " " "*"
## 14 ( 1 ) "*" " " "*" " " "*"
## 15 ( 1 ) "*" " " "*" " " "*"
## 16 ( 1 ) "*" " " "*" " " "*"
## 17 ( 1 ) "*" " " "*" " " "*"
## 18 ( 1 ) "*" " " "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*" "*"
## 21 ( 1 ) "*" "*" "*" "*" "*"
## 22 ( 1 ) "*" "*" "*" "*" "*"
## MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) "*" "*" " " " " " "
## 5 ( 1 ) " " "*" "*" " " "*"
## 6 ( 1 ) " " "*" "*" " " "*"
## 7 ( 1 ) " " "*" "*" " " "*"
## 8 ( 1 ) "*" "*" " " " " "*"
## 9 ( 1 ) "*" "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " "*"
## 11 ( 1 ) "*" "*" " " " " "*"
## 12 ( 1 ) "*" "*" " " "*" "*"
## 13 ( 1 ) " " "*" "*" "*" "*"
## 14 ( 1 ) "*" "*" " " "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*"
## 16 ( 1 ) " " "*" "*" "*" "*"
## 17 ( 1 ) " " "*" "*" "*" "*"
## 18 ( 1 ) " " "*" "*" "*" "*"
## 19 ( 1 ) " " "*" "*" "*" "*"
## 20 ( 1 ) " " "*" "*" "*" "*"
## 21 ( 1 ) "*" "*" "*" "*" "*"
## 22 ( 1 ) "*" "*" "*" "*" "*"
## Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR RPDE DFA
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " " " " " " " " " " " " " "*"
## 5 ( 1 ) " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" " " " " " " " " " " " " "*"
## 7 ( 1 ) "*" " " " " " " " " " " " " "*"
## 8 ( 1 ) "*" " " " " " " " " " " "*" "*"
## 9 ( 1 ) " " "*" " " " " " " " " "*" "*"
## 10 ( 1 ) "*" " " " " " " " " " " "*" "*"
## 11 ( 1 ) "*" " " "*" " " " " " " "*" "*"
## 12 ( 1 ) "*" " " "*" " " " " " " "*" "*"
## 13 ( 1 ) "*" "*" "*" " " " " " " "*" "*"
## 14 ( 1 ) "*" " " "*" "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*"
## 21 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*"
## 22 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## spread1 spread2 D2 PPE
## 1 ( 1 ) "*" " " " " " "
## 2 ( 1 ) "*" " " " " " "
## 3 ( 1 ) "*" " " " " "*"
## 4 ( 1 ) "*" " " " " " "
## 5 ( 1 ) "*" " " " " " "
## 6 ( 1 ) "*" " " " " " "
## 7 ( 1 ) "*" " " " " " "
## 8 ( 1 ) "*" " " " " " "
## 9 ( 1 ) "*" " " " " " "
## 10 ( 1 ) "*" " " " " " "
## 11 ( 1 ) "*" " " " " " "
## 12 ( 1 ) "*" " " " " " "
## 13 ( 1 ) "*" " " " " " "
## 14 ( 1 ) "*" "*" " " " "
## 15 ( 1 ) "*" "*" " " " "
## 16 ( 1 ) "*" "*" " " " "
## 17 ( 1 ) "*" "*" "*" " "
## 18 ( 1 ) "*" "*" "*" " "
## 19 ( 1 ) "*" "*" "*" " "
## 20 ( 1 ) "*" "*" "*" "*"
## 21 ( 1 ) "*" "*" "*" "*"
## 22 ( 1 ) "*" "*" "*" "*"
names(reg_full_sum)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
names(regfit_full)
## [1] "np" "nrbar" "d" "rbar" "thetab" "first"
## [7] "last" "vorder" "tol" "rss" "bound" "nvmax"
## [13] "ress" "ir" "nbest" "lopt" "il" "ier"
## [19] "xnames" "method" "force.in" "force.out" "sserr" "intercept"
## [25] "lindep" "nullrss" "nn" "call"
Plotting
par(mfrow=c(2,2))
#RSS
plot(reg_full_sum$rss, xlab="Number of variables", ylab="RSS", type ="l")
mnrss=which.min(reg_full_sum$rss)
#Plotting min RSS
points(mnrss, reg_full_sum$adjr2[mnrss], col="red", cex=2, pch=20)
#Adjusted R^2
plot(reg_full_sum$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type ="l")
mxr2=which.max(reg_full_sum$adjr2)
#Plotting max adjusted R^2
points(mxr2, reg_full_sum$adjr2[mxr2], col="red", cex=2, pch=20)
#CP
plot(reg_full_sum$cp, xlab = "Number of variables", ylab="CP", type = "l")
mncp=which.min(reg_full_sum$cp)
points(mncp, reg_full_sum$cp[mncp], col="blue", cex=2, pch=20)
#BIC
plot(reg_full_sum$bic, xlab="Number of variables", ylab = "BIC", type="l")
mnbic=which.min(reg_full_sum$bic)
points(mnbic, reg_full_sum$bic[mnbic], col="green", pch=20, cex=2)
BIC selects for a smaller model (5 predictors) (heavier penalty)
CP min chooses around 6.
Adjusted R^2 max is around 8.
As expected, RSS decreases as more predictors are added, but this is not indicative of test error rate.
Plotting Selected Variables
plot(regfit_full, scale="r2")
plot(regfit_full, scale="adjr2")
plot(regfit_full, scale="Cp")
plot(regfit_full, scale="bic")
Coef of models
"model with highest adjusted R^2"
## [1] "model with highest adjusted R^2"
coef(regfit_full, mxr2) #model with highest adjusted R^2
## (Intercept) MDVP.Flo.Hz. MDVP.RAP MDVP.PPQ
## 2.746547e+00 -1.486939e-03 1.527527e+02 -2.698480e+02
## MDVP.Shimmer.dB. Shimmer.APQ3 RPDE DFA
## 2.178926e+00 -2.396256e+01 -6.369941e-01 2.261761e+00
## spread1
## 3.434226e-01
"model with lowest CP"
## [1] "model with lowest CP"
coef(regfit_full, mncp) #model with lowest CP
## (Intercept) MDVP.PPQ Jitter.DDP MDVP.Shimmer.dB.
## 1.9776112 -265.6049616 51.6839298 1.8572010
## Shimmer.APQ3 DFA spread1
## -20.8234354 2.5707226 0.3296206
"model with lowest BIC"
## [1] "model with lowest BIC"
coef(regfit_full, mnbic) #model with lowest BIC
## (Intercept) MDVP.PPQ Jitter.DDP MDVP.Shimmer.dB.
## 2.0008538 -209.8104429 38.6909884 0.7094366
## DFA spread1
## 2.3236992 0.3135981
x=model.matrix(status~., pd2)[,-1] #[,-1] to remove the intercept
y=pd2$status
#automatically transforms any qualitative variables into dummy variables
#standardizes by default
#alpha=0 ridge regression, alpha=1 lasso
grid=10^seq(10, -2, length=100) #10 to the power of a sequence -> our lambda values
grid
## [1] 1.000000e+10 7.564633e+09 5.722368e+09 4.328761e+09 3.274549e+09
## [6] 2.477076e+09 1.873817e+09 1.417474e+09 1.072267e+09 8.111308e+08
## [11] 6.135907e+08 4.641589e+08 3.511192e+08 2.656088e+08 2.009233e+08
## [16] 1.519911e+08 1.149757e+08 8.697490e+07 6.579332e+07 4.977024e+07
## [21] 3.764936e+07 2.848036e+07 2.154435e+07 1.629751e+07 1.232847e+07
## [26] 9.326033e+06 7.054802e+06 5.336699e+06 4.037017e+06 3.053856e+06
## [31] 2.310130e+06 1.747528e+06 1.321941e+06 1.000000e+06 7.564633e+05
## [36] 5.722368e+05 4.328761e+05 3.274549e+05 2.477076e+05 1.873817e+05
## [41] 1.417474e+05 1.072267e+05 8.111308e+04 6.135907e+04 4.641589e+04
## [46] 3.511192e+04 2.656088e+04 2.009233e+04 1.519911e+04 1.149757e+04
## [51] 8.697490e+03 6.579332e+03 4.977024e+03 3.764936e+03 2.848036e+03
## [56] 2.154435e+03 1.629751e+03 1.232847e+03 9.326033e+02 7.054802e+02
## [61] 5.336699e+02 4.037017e+02 3.053856e+02 2.310130e+02 1.747528e+02
## [66] 1.321941e+02 1.000000e+02 7.564633e+01 5.722368e+01 4.328761e+01
## [71] 3.274549e+01 2.477076e+01 1.873817e+01 1.417474e+01 1.072267e+01
## [76] 8.111308e+00 6.135907e+00 4.641589e+00 3.511192e+00 2.656088e+00
## [81] 2.009233e+00 1.519911e+00 1.149757e+00 8.697490e-01 6.579332e-01
## [86] 4.977024e-01 3.764936e-01 2.848036e-01 2.154435e-01 1.629751e-01
## [91] 1.232847e-01 9.326033e-02 7.054802e-02 5.336699e-02 4.037017e-02
## [96] 3.053856e-02 2.310130e-02 1.747528e-02 1.321941e-02 1.000000e-02
ridge.mod=glmnet(x, y, alpha=0, lambda=grid, family = "binomial")
#rows (predictors), columns (lambda values)
ridge.mod$lambda[50] #lambda value
## [1] 11497.57
coef(ridge.mod)[,50] #coef for this lambda value (index)
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## 1.118985e+00 -3.480464e-07 -6.820632e-08 -3.281221e-07
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ
## 2.155138e-03 3.652407e-01 3.374402e-03 3.929665e-03
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## 1.124699e-03 7.317776e-04 6.758354e-05 1.285790e-03
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## 1.096786e-03 8.073509e-04 4.285862e-04 1.759856e-04
## HNR RPDE DFA spread1
## -3.067666e-06 1.114926e-04 1.572944e-04 1.945867e-05
## spread2 D2 PPE
## 2.048184e-04 3.338087e-05 2.213090e-04
Estimating Test Error
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12, family = "binomial")
ridge.pred=predict(ridge.mod,s=4,newx=x[test,])
mean((ridge.pred-y.test)^2)
## Warning in Ops.factor(ridge.pred, y.test): '-' not meaningful for factors
## [1] NA
#check vs least squares lambda=0
ridge.pred=predict(ridge.mod, s=0, newx=x[test,], exact = T, x=x[train,], y=y[train])
## Warning: from glmnet Fortran code (error code -101); Convergence for 101th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
mean((ridge.pred-y.test)^2)
## Warning in Ops.factor(ridge.pred, y.test): '-' not meaningful for factors
## [1] NA
lm(y~x, subset = train)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
##
## Call:
## lm(formula = y ~ x, subset = train)
##
## Coefficients:
## (Intercept) xMDVP.Fo.Hz. xMDVP.Fhi.Hz. xMDVP.Flo.Hz.
## 2.543e+00 -2.456e-03 1.312e-04 -1.156e-03
## xMDVP.Jitter... xMDVP.Jitter.Abs. xMDVP.RAP xMDVP.PPQ
## -2.589e+01 -8.314e+03 -4.337e+02 -1.994e+02
## xJitter.DDP xMDVP.Shimmer xMDVP.Shimmer.dB. xShimmer.APQ3
## 2.222e+02 4.607e+01 1.584e+00 -9.141e+03
## xShimmer.APQ5 xMDVP.APQ xShimmer.DDA xNHR
## -1.380e+01 -1.707e+01 3.027e+03 -5.984e-02
## xHNR xRPDE xDFA xspread1
## 9.963e-03 -4.796e-01 1.921e+00 3.055e-01
## xspread2 xD2 xPPE
## 3.100e-01 5.409e-02 3.579e-01
predict(ridge.mod,s=0, exact = T, type="coefficients", x=x[train,], y=y[train])[1:20,]
## Warning: from glmnet Fortran code (error code -101); Convergence for 101th
## lambda value not reached after maxit=100000 iterations; solutions for larger
## lambdas returned
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## -4.530045e+00 -8.450098e-03 -1.067700e-03 -3.164070e-03
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ
## -4.930979e+01 -1.025792e+04 3.104202e+01 -9.981425e+01
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## 1.041818e+01 2.190459e+01 2.310188e+00 1.219492e+01
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## 2.613212e+01 3.634479e+01 4.069749e+00 1.922760e+00
## HNR RPDE DFA spread1
## 8.991673e-02 -3.457758e+00 9.767619e+00 9.881682e-01
Cross Validation for Ridge
set.seed(4706)
cv.out=cv.glmnet(x[train,], y[train], alpha=0, family = "binomial")
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
## [1] 0.03760963
#predict on test using best lambda
ridge.pred=predict(ridge.mod, s=bestlam, newx=x[test,])
mean((ridge.pred-y.test)^2)
## Warning in Ops.factor(ridge.pred, y.test): '-' not meaningful for factors
## [1] NA
#refit on full data
out=glmnet(x,y,alpha=0, family = "binomial")
predict(out,type="coefficients",s=bestlam)[1:20,]
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## -1.053252e+00 -6.012875e-03 -2.183152e-03 -2.945454e-03
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ
## -2.128719e+01 -1.543317e+03 3.025824e+01 -5.665370e+00
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## 1.008551e+01 8.520327e+00 6.977416e-01 6.524013e+00
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## 1.387721e+01 1.560451e+01 2.191096e+00 -3.414915e+00
## HNR RPDE DFA spread1
## 5.994621e-03 -1.577644e+00 3.156172e+00 5.734877e-01
#Fitting on training data
lasso.mod=glmnet(x[train,], y[train], alpha=1, lambda=grid, family="binomial")
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
#Getting best lambda
set.seed(4706)
cv.out=cv.glmnet(x[train,], y[train], alpha=1, family="binomial")
plot(cv.out)
bestlam=cv.out$lambda.min #lambda.min=lambda with lowest error
bestlam
## [1] 0.02307692
lasso.pred=predict(lasso.mod, s=bestlam, newx=x[test,])
mean((lasso.pred!=y.test))
## [1] 1
#Refitting on full data using best lambda
out=glmnet(x, y, alpha=1, lambda=grid, family="binomial")
lasso.coef=predict(out, type="coefficients", s=bestlam)
lasso.coef
## 23 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 5.047928708
## MDVP.Fo.Hz. -0.001916708
## MDVP.Fhi.Hz. -0.001457227
## MDVP.Flo.Hz. .
## MDVP.Jitter... .
## MDVP.Jitter.Abs. .
## MDVP.RAP .
## MDVP.PPQ .
## Jitter.DDP .
## MDVP.Shimmer .
## MDVP.Shimmer.dB. .
## Shimmer.APQ3 .
## Shimmer.APQ5 .
## MDVP.APQ 13.251313646
## Shimmer.DDA .
## NHR .
## HNR .
## RPDE .
## DFA 2.568419871
## spread1 1.364326482
## spread2 3.328087998
## D2 0.961838161
## PPE .
Predictors chosen by lasso: MDVP.Fo.Hz.+MDVP.Fhi.Hz+MDVP.APQ+DFA+spread1+spread2+D2
I fit a logistic regression model on the training data
logist_reg_fit=glm(status~., data=pd2, family=binomial, subset=train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logist_reg_fit)
##
## Call:
## glm(formula = status ~ ., family = binomial, data = pd2, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.25209 0.00000 0.00454 0.14252 1.50515
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.857e+01 4.922e+01 -0.377 0.7059
## MDVP.Fo.Hz. 3.087e-03 5.866e-02 0.053 0.9580
## MDVP.Fhi.Hz. 3.238e-03 1.513e-02 0.214 0.8305
## MDVP.Flo.Hz. -1.936e-02 3.836e-02 -0.505 0.6138
## MDVP.Jitter... -1.497e+03 2.542e+03 -0.589 0.5561
## MDVP.Jitter.Abs. -2.389e+05 2.527e+05 -0.945 0.3446
## MDVP.RAP -6.776e+04 2.884e+05 -0.235 0.8143
## MDVP.PPQ -1.864e+03 3.768e+03 -0.495 0.6207
## Jitter.DDP 2.365e+04 9.598e+04 0.246 0.8053
## MDVP.Shimmer 5.099e+03 4.432e+03 1.151 0.2499
## MDVP.Shimmer.dB. 8.412e+01 1.231e+02 0.683 0.4944
## Shimmer.APQ3 1.352e+05 3.108e+05 0.435 0.6636
## Shimmer.APQ5 -2.056e+03 1.710e+03 -1.202 0.2292
## MDVP.APQ -1.068e+03 1.249e+03 -0.855 0.3925
## Shimmer.DDA -4.701e+04 1.037e+05 -0.453 0.6502
## NHR 8.646e+01 1.023e+02 0.845 0.3982
## HNR 4.297e-01 5.681e-01 0.756 0.4494
## RPDE 4.853e-01 9.039e+00 0.054 0.9572
## DFA 5.860e+01 2.999e+01 1.954 0.0507 .
## spread1 6.140e+00 6.336e+00 0.969 0.3326
## spread2 -3.070e+00 1.360e+01 -0.226 0.8214
## D2 9.834e-01 2.602e+00 0.378 0.7055
## PPE 5.662e+00 6.918e+01 0.082 0.9348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 109.105 on 97 degrees of freedom
## Residual deviance: 32.917 on 75 degrees of freedom
## AIC: 78.917
##
## Number of Fisher Scoring iterations: 11
#"Coefficients"
#coef(logist_reg_fit)
#"P-Values"
#summary(logist_reg_fit)$coef[ ,4]
step(logist_reg_fit, direction = "forward")
## Start: AIC=78.92
## status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... +
## MDVP.Jitter.Abs. + MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer +
## MDVP.Shimmer.dB. + Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ +
## Shimmer.DDA + NHR + HNR + RPDE + DFA + spread1 + spread2 +
## D2 + PPE
##
## Call: glm(formula = status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.Flo.Hz. +
## MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.RAP + MDVP.PPQ +
## Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 +
## Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + HNR + RPDE +
## DFA + spread1 + spread2 + D2 + PPE, family = binomial, data = pd2,
## subset = train)
##
## Coefficients:
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## -1.857e+01 3.087e-03 3.238e-03 -1.936e-02
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ
## -1.497e+03 -2.389e+05 -6.776e+04 -1.864e+03
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## 2.365e+04 5.099e+03 8.412e+01 1.352e+05
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## -2.056e+03 -1.068e+03 -4.701e+04 8.646e+01
## HNR RPDE DFA spread1
## 4.297e-01 4.853e-01 5.860e+01 6.140e+00
## spread2 D2 PPE
## -3.070e+00 9.834e-01 5.662e+00
##
## Degrees of Freedom: 97 Total (i.e. Null); 75 Residual
## Null Deviance: 109.1
## Residual Deviance: 32.92 AIC: 78.92
MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.Flo.Hz.+MDVP.Jitter…+MDVP.Jitter.Abs.+MDVP.RAP
step(logist_reg_fit, direction = "backward")
## Start: AIC=78.92
## status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... +
## MDVP.Jitter.Abs. + MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer +
## MDVP.Shimmer.dB. + Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ +
## Shimmer.DDA + NHR + HNR + RPDE + DFA + spread1 + spread2 +
## D2 + PPE
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - MDVP.Fo.Hz. 1 32.920 76.920
## - RPDE 1 32.920 76.920
## - PPE 1 32.924 76.924
## - spread2 1 32.969 76.969
## - MDVP.RAP 1 32.971 76.971
## - MDVP.Fhi.Hz. 1 32.973 76.973
## - Jitter.DDP 1 32.977 76.977
## - D2 1 33.062 77.062
## - Shimmer.APQ3 1 33.127 77.127
## - Shimmer.DDA 1 33.147 77.147
## - MDVP.PPQ 1 33.172 77.172
## - MDVP.Flo.Hz. 1 33.192 77.192
## - MDVP.Jitter... 1 33.281 77.281
## - MDVP.Shimmer.dB. 1 33.453 77.453
## - HNR 1 33.568 77.568
## - MDVP.APQ 1 33.709 77.709
## - NHR 1 33.838 77.838
## - MDVP.Jitter.Abs. 1 33.954 77.954
## - MDVP.Shimmer 1 34.551 78.551
## - spread1 1 34.758 78.758
## - Shimmer.APQ5 1 34.780 78.780
## <none> 32.917 78.917
## - DFA 1 38.827 82.827
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=76.92
## status ~ MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. +
## MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. +
## Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR +
## HNR + RPDE + DFA + spread1 + spread2 + D2 + PPE
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - RPDE 1 32.925 74.925
## - PPE 1 32.930 74.930
## - MDVP.RAP 1 32.972 74.972
## - spread2 1 32.976 74.976
## - Jitter.DDP 1 32.977 74.977
## - MDVP.Fhi.Hz. 1 32.987 74.987
## - D2 1 33.086 75.086
## - Shimmer.APQ3 1 33.131 75.131
## - Shimmer.DDA 1 33.150 75.150
## - MDVP.PPQ 1 33.173 75.173
## - MDVP.Flo.Hz. 1 33.240 75.240
## - MDVP.Jitter... 1 33.283 75.283
## - MDVP.Shimmer.dB. 1 33.560 75.560
## - HNR 1 33.584 75.584
## - MDVP.APQ 1 33.725 75.725
## - NHR 1 33.855 75.855
## - MDVP.Shimmer 1 34.775 76.775
## - spread1 1 34.819 76.819
## <none> 32.920 76.920
## - MDVP.Jitter.Abs. 1 35.153 77.153
## - Shimmer.APQ5 1 35.156 77.156
## - DFA 1 43.848 85.848
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=74.92
## status ~ MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. +
## MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. +
## Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR +
## HNR + DFA + spread1 + spread2 + D2 + PPE
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - PPE 1 32.936 72.936
## - MDVP.RAP 1 32.972 72.972
## - Jitter.DDP 1 32.978 72.978
## - spread2 1 32.991 72.991
## - MDVP.Fhi.Hz. 1 33.004 73.004
## - D2 1 33.111 73.111
## - Shimmer.APQ3 1 33.131 73.131
## - Shimmer.DDA 1 33.150 73.150
## - MDVP.PPQ 1 33.219 73.219
## - MDVP.Flo.Hz. 1 33.241 73.241
## - MDVP.Jitter... 1 33.331 73.331
## - MDVP.Shimmer.dB. 1 33.593 73.593
## - HNR 1 33.645 73.645
## - MDVP.APQ 1 33.742 73.742
## - NHR 1 33.880 73.880
## - MDVP.Shimmer 1 34.851 74.851
## - spread1 1 34.865 74.865
## <none> 32.925 74.925
## - Shimmer.APQ5 1 35.193 75.193
## - MDVP.Jitter.Abs. 1 35.840 75.840
## - DFA 1 44.698 84.698
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=72.94
## status ~ MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. +
## MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. +
## Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR +
## HNR + DFA + spread1 + spread2 + D2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - MDVP.RAP 1 32.979 70.979
## - Jitter.DDP 1 32.985 70.985
## - MDVP.Fhi.Hz. 1 33.007 71.007
## - spread2 1 33.014 71.014
## - D2 1 33.128 71.128
## - Shimmer.APQ3 1 33.159 71.159
## - Shimmer.DDA 1 33.180 71.180
## - MDVP.PPQ 1 33.226 71.226
## - MDVP.Flo.Hz. 1 33.242 71.242
## - MDVP.Jitter... 1 33.381 71.381
## - HNR 1 33.646 71.646
## - MDVP.Shimmer.dB. 1 33.707 71.707
## - MDVP.APQ 1 33.855 71.855
## - NHR 1 33.884 71.884
## <none> 32.936 72.936
## - MDVP.Shimmer 1 35.000 73.000
## - Shimmer.APQ5 1 35.433 73.433
## - MDVP.Jitter.Abs. 1 35.956 73.956
## - DFA 1 44.698 82.698
## - spread1 1 45.859 83.859
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=70.98
## status ~ MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. +
## MDVP.PPQ + Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. +
## Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR +
## HNR + DFA + spread1 + spread2 + D2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - MDVP.Fhi.Hz. 1 33.041 69.041
## - spread2 1 33.076 69.076
## - D2 1 33.243 69.243
## - MDVP.Flo.Hz. 1 33.298 69.298
## - MDVP.PPQ 1 33.341 69.341
## - Shimmer.APQ3 1 33.347 69.347
## - Shimmer.DDA 1 33.377 69.377
## - MDVP.Jitter... 1 33.456 69.456
## - HNR 1 33.654 69.654
## - NHR 1 33.885 69.885
## - MDVP.APQ 1 33.993 69.993
## - MDVP.Shimmer.dB. 1 34.275 70.275
## <none> 32.979 70.979
## - MDVP.Shimmer 1 35.070 71.070
## - Jitter.DDP 1 35.396 71.396
## - Shimmer.APQ5 1 35.534 71.534
## - MDVP.Jitter.Abs. 1 36.284 72.284
## - DFA 1 45.193 81.193
## - spread1 1 46.724 82.724
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=69.04
## status ~ MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ +
## Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 +
## Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + HNR + DFA +
## spread1 + spread2 + D2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - spread2 1 33.102 67.102
## - D2 1 33.334 67.334
## - MDVP.PPQ 1 33.349 67.349
## - MDVP.Flo.Hz. 1 33.365 67.365
## - Shimmer.APQ3 1 33.404 67.404
## - Shimmer.DDA 1 33.432 67.432
## - MDVP.Jitter... 1 33.545 67.545
## - HNR 1 33.832 67.832
## - MDVP.APQ 1 34.012 68.012
## - NHR 1 34.076 68.076
## - MDVP.Shimmer.dB. 1 34.828 68.828
## <none> 33.041 69.041
## - MDVP.Shimmer 1 35.109 69.109
## - Jitter.DDP 1 35.402 69.402
## - Shimmer.APQ5 1 35.606 69.606
## - MDVP.Jitter.Abs. 1 37.812 71.812
## - DFA 1 46.159 80.159
## - spread1 1 46.770 80.770
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=67.1
## status ~ MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ +
## Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 +
## Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + HNR + DFA +
## spread1 + D2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - D2 1 33.355 65.355
## - MDVP.PPQ 1 33.404 65.404
## - MDVP.Flo.Hz. 1 33.412 65.412
## - Shimmer.APQ3 1 33.461 65.461
## - Shimmer.DDA 1 33.487 65.487
## - MDVP.Jitter... 1 33.620 65.620
## - HNR 1 33.833 65.833
## - MDVP.APQ 1 34.015 66.015
## - NHR 1 34.078 66.078
## - MDVP.Shimmer.dB. 1 35.038 67.038
## <none> 33.102 67.102
## - MDVP.Shimmer 1 35.112 67.112
## - Shimmer.APQ5 1 35.634 67.634
## - Jitter.DDP 1 35.977 67.977
## - MDVP.Jitter.Abs. 1 37.880 69.880
## - DFA 1 46.248 78.248
## - spread1 1 52.354 84.354
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=65.36
## status ~ MDVP.Flo.Hz. + MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ +
## Jitter.DDP + MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 +
## Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA + NHR + HNR + DFA +
## spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - MDVP.Flo.Hz. 1 33.547 63.547
## - Shimmer.APQ3 1 33.665 63.665
## - Shimmer.DDA 1 33.691 63.691
## - MDVP.Jitter... 1 33.758 63.758
## - HNR 1 33.870 63.870
## - MDVP.PPQ 1 33.938 63.938
## - NHR 1 34.278 64.278
## - MDVP.APQ 1 34.794 64.794
## - MDVP.Shimmer.dB. 1 35.162 65.162
## <none> 33.355 65.355
## - Shimmer.APQ5 1 35.968 65.968
## - MDVP.Shimmer 1 36.001 66.001
## - Jitter.DDP 1 36.556 66.556
## - MDVP.Jitter.Abs. 1 38.321 68.321
## - DFA 1 46.626 76.626
## - spread1 1 57.434 87.434
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=63.55
## status ~ MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP +
## MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ3 + Shimmer.APQ5 +
## MDVP.APQ + Shimmer.DDA + NHR + HNR + DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - Shimmer.APQ3 1 33.808 61.808
## - Shimmer.DDA 1 33.832 61.832
## - HNR 1 33.874 61.874
## - MDVP.Jitter... 1 33.917 61.917
## - MDVP.PPQ 1 34.285 62.285
## - NHR 1 34.509 62.509
## - MDVP.APQ 1 35.110 63.110
## - MDVP.Shimmer.dB. 1 35.212 63.212
## <none> 33.547 63.547
## - Shimmer.APQ5 1 36.132 64.132
## - MDVP.Shimmer 1 36.196 64.196
## - Jitter.DDP 1 36.623 64.623
## - MDVP.Jitter.Abs. 1 38.505 66.505
## - DFA 1 46.722 74.722
## - spread1 1 58.958 86.958
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=61.81
## status ~ MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP +
## MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ +
## Shimmer.DDA + NHR + HNR + DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - HNR 1 34.153 60.153
## - MDVP.Jitter... 1 34.162 60.162
## - MDVP.PPQ 1 34.529 60.529
## - NHR 1 34.847 60.847
## - MDVP.Shimmer.dB. 1 35.405 61.405
## <none> 33.808 61.808
## - MDVP.APQ 1 36.069 62.069
## - Shimmer.APQ5 1 36.458 62.458
## - Jitter.DDP 1 36.776 62.776
## - MDVP.Shimmer 1 37.051 63.051
## - MDVP.Jitter.Abs. 1 38.543 64.543
## - Shimmer.DDA 1 39.258 65.258
## - DFA 1 46.722 72.722
## - spread1 1 58.961 84.961
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=60.15
## status ~ MDVP.Jitter... + MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP +
## MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ +
## Shimmer.DDA + NHR + DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - MDVP.Jitter... 1 34.488 58.488
## - MDVP.PPQ 1 35.022 59.022
## - NHR 1 35.114 59.114
## - MDVP.Shimmer.dB. 1 35.941 59.941
## <none> 34.153 60.153
## - Jitter.DDP 1 37.108 61.108
## - Shimmer.APQ5 1 37.142 61.142
## - MDVP.APQ 1 38.018 62.018
## - MDVP.Jitter.Abs. 1 38.629 62.629
## - MDVP.Shimmer 1 38.780 62.780
## - Shimmer.DDA 1 42.548 66.548
## - DFA 1 49.184 73.184
## - spread1 1 60.020 84.020
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=58.49
## status ~ MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer +
## MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA +
## NHR + DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - NHR 1 35.358 57.358
## - MDVP.Shimmer.dB. 1 35.998 57.998
## <none> 34.488 58.488
## - Shimmer.APQ5 1 37.619 59.619
## - Jitter.DDP 1 37.777 59.777
## - MDVP.APQ 1 38.050 60.050
## - MDVP.Shimmer 1 38.780 60.780
## - MDVP.PPQ 1 39.000 61.000
## - MDVP.Jitter.Abs. 1 39.272 61.272
## - Shimmer.DDA 1 42.561 64.561
## - DFA 1 49.426 71.426
## - spread1 1 60.366 82.366
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=57.36
## status ~ MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer +
## MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ + Shimmer.DDA +
## DFA + spread1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 35.358 57.358
## - MDVP.Shimmer.dB. 1 37.395 57.395
## - MDVP.APQ 1 38.509 58.509
## - Shimmer.APQ5 1 38.537 58.537
## - MDVP.Shimmer 1 39.054 59.054
## - MDVP.Jitter.Abs. 1 39.369 59.369
## - MDVP.PPQ 1 40.153 60.153
## - Jitter.DDP 1 40.348 60.348
## - Shimmer.DDA 1 42.827 62.827
## - DFA 1 50.922 70.922
## - spread1 1 61.944 81.944
##
## Call: glm(formula = status ~ MDVP.Jitter.Abs. + MDVP.PPQ + Jitter.DDP +
## MDVP.Shimmer + MDVP.Shimmer.dB. + Shimmer.APQ5 + MDVP.APQ +
## Shimmer.DDA + DFA + spread1, family = binomial, data = pd2,
## subset = train)
##
## Coefficients:
## (Intercept) MDVP.Jitter.Abs. MDVP.PPQ Jitter.DDP
## 3.101e+00 -1.755e+05 -3.916e+03 9.114e+02
## MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ5 MDVP.APQ
## 4.540e+03 7.531e+01 -1.346e+03 -1.218e+03
## Shimmer.DDA DFA spread1
## -1.851e+03 4.321e+01 5.612e+00
##
## Degrees of Freedom: 97 Total (i.e. Null); 87 Residual
## Null Deviance: 109.1
## Residual Deviance: 35.36 AIC: 57.36
At first glance, the p-values are all very large, except for DFA, which is smaller but still pretty large.
contrasts(pd2[,"status"])
## 1
## 0 0
## 1 1
glm_probs = predict(logist_reg_fit, pd2[test,], type="response")
head(glm_probs)
## phon_R01_S04_4 phon_R01_S22_6 phon_R01_S08_5 phon_R01_S05_1 phon_R01_S21_6
## 0.3258190 0.9997452 0.7062405 1.0000000 1.0000000
## phon_R01_S05_3
## 1.0000000
glm_pred=ifelse(glm_probs>0.5, 1, 0)
head(glm_pred)
## phon_R01_S04_4 phon_R01_S22_6 phon_R01_S08_5 phon_R01_S05_1 phon_R01_S21_6
## 0 1 1 1 1
## phon_R01_S05_3
## 1
length(glm_pred)
## [1] 97
length(y.test)
## [1] 97
table(glm_pred, y.test)
## y.test
## glm_pred 0 1
## 0 17 11
## 1 7 62
Classification Error
mean(glm_pred!=y.test)
## [1] 0.185567
Accuracy
mean(glm_pred==y.test)
## [1] 0.814433
Using Predictors Chosen By Lasso MDVP.Fo.Hz.+MDVP.Fhi.Hz+MDVP.APQ+DFA+spread1+spread2+D2
colnames(pd2)
## [1] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..."
## [5] "MDVP.Jitter.Abs." "MDVP.RAP" "MDVP.PPQ" "Jitter.DDP"
## [9] "MDVP.Shimmer" "MDVP.Shimmer.dB." "Shimmer.APQ3" "Shimmer.APQ5"
## [13] "MDVP.APQ" "Shimmer.DDA" "NHR" "HNR"
## [17] "RPDE" "DFA" "spread1" "spread2"
## [21] "D2" "PPE" "status"
logist_reg_fit2=glm(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=pd2, family=binomial, subset=train)
summary(logist_reg_fit)
##
## Call:
## glm(formula = status ~ ., family = binomial, data = pd2, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.25209 0.00000 0.00454 0.14252 1.50515
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.857e+01 4.922e+01 -0.377 0.7059
## MDVP.Fo.Hz. 3.087e-03 5.866e-02 0.053 0.9580
## MDVP.Fhi.Hz. 3.238e-03 1.513e-02 0.214 0.8305
## MDVP.Flo.Hz. -1.936e-02 3.836e-02 -0.505 0.6138
## MDVP.Jitter... -1.497e+03 2.542e+03 -0.589 0.5561
## MDVP.Jitter.Abs. -2.389e+05 2.527e+05 -0.945 0.3446
## MDVP.RAP -6.776e+04 2.884e+05 -0.235 0.8143
## MDVP.PPQ -1.864e+03 3.768e+03 -0.495 0.6207
## Jitter.DDP 2.365e+04 9.598e+04 0.246 0.8053
## MDVP.Shimmer 5.099e+03 4.432e+03 1.151 0.2499
## MDVP.Shimmer.dB. 8.412e+01 1.231e+02 0.683 0.4944
## Shimmer.APQ3 1.352e+05 3.108e+05 0.435 0.6636
## Shimmer.APQ5 -2.056e+03 1.710e+03 -1.202 0.2292
## MDVP.APQ -1.068e+03 1.249e+03 -0.855 0.3925
## Shimmer.DDA -4.701e+04 1.037e+05 -0.453 0.6502
## NHR 8.646e+01 1.023e+02 0.845 0.3982
## HNR 4.297e-01 5.681e-01 0.756 0.4494
## RPDE 4.853e-01 9.039e+00 0.054 0.9572
## DFA 5.860e+01 2.999e+01 1.954 0.0507 .
## spread1 6.140e+00 6.336e+00 0.969 0.3326
## spread2 -3.070e+00 1.360e+01 -0.226 0.8214
## D2 9.834e-01 2.602e+00 0.378 0.7055
## PPE 5.662e+00 6.918e+01 0.082 0.9348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 109.105 on 97 degrees of freedom
## Residual deviance: 32.917 on 75 degrees of freedom
## AIC: 78.917
##
## Number of Fisher Scoring iterations: 11
glm_probs = predict(logist_reg_fit2, pd2[test,], type="response")
head(glm_probs)
## phon_R01_S04_4 phon_R01_S22_6 phon_R01_S08_5 phon_R01_S05_1 phon_R01_S21_6
## 0.6355746 0.8711462 0.7096464 0.9999748 0.9998081
## phon_R01_S05_3
## 0.9992184
glm_pred=ifelse(glm_probs>0.5, 1, 0)
head(glm_pred)
## phon_R01_S04_4 phon_R01_S22_6 phon_R01_S08_5 phon_R01_S05_1 phon_R01_S21_6
## 1 1 1 1 1
## phon_R01_S05_3
## 1
table(glm_pred, y.test)
## y.test
## glm_pred 0 1
## 0 15 8
## 1 9 65
"Error"
## [1] "Error"
mean(glm_pred!=y.test)
## [1] 0.1752577
"Accuracy"
## [1] "Accuracy"
mean(glm_pred==y.test)
## [1] 0.8247423
The model improves
lda_fit = lda(status~., data=pd2[,-5], subset=train)
lda_fit
## Call:
## lda(status ~ ., data = pd2[, -5], subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.244898 0.755102
##
## Group means:
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter... MDVP.RAP MDVP.PPQ
## 0 180.8565 209.1609 136.2219 0.004001667 0.001983750 0.002148750
## 1 142.4056 189.3842 108.2484 0.007763919 0.004226351 0.004345676
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ
## 0 0.0059525 0.01721833 0.1580000 0.009189167 0.01038167 0.01322583
## 1 0.0126800 0.03571662 0.3466757 0.018682162 0.02135635 0.02978635
## Shimmer.DDA NHR HNR RPDE DFA spread1 spread2
## 0 0.02756625 0.01295083 24.76229 0.4501494 0.6909354 -6.676518 0.1636713
## 1 0.05604608 0.03406108 20.75650 0.5152109 0.7310390 -5.278859 0.2474796
## D2 PPE
## 0 2.132207 0.1312950
## 1 2.432709 0.2400174
##
## Coefficients of linear discriminants:
## LD1
## MDVP.Fo.Hz. -3.728117e-03
## MDVP.Fhi.Hz. 6.644095e-04
## MDVP.Flo.Hz. -6.176427e-03
## MDVP.Jitter... -4.139984e+02
## MDVP.RAP -7.502706e+03
## MDVP.PPQ -7.046188e+02
## Jitter.DDP 2.817771e+03
## MDVP.Shimmer 1.670053e+02
## MDVP.Shimmer.dB. 8.091584e+00
## Shimmer.APQ3 -4.016317e+04
## Shimmer.APQ5 -9.464667e+01
## MDVP.APQ -3.623173e+01
## Shimmer.DDA 1.331237e+04
## NHR -3.928908e-01
## HNR 2.446249e-02
## RPDE -2.670761e+00
## DFA 9.791429e+00
## spread1 1.458082e+00
## spread2 1.285339e+00
## D2 4.694775e-01
## PPE -1.465356e+00
plot(lda_fit) #produces plots of the linear discriminants
lda_pred = predict(lda_fit, pd2[test,])
names(lda_pred) #the names are class, contains LDA’s predictions
## [1] "class" "posterior" "x"
lda_class = lda_pred$class #the class predictions of our fitted model on test data
table(lda_class, y.test) #predictions vs real values of test data
## y.test
## lda_class 0 1
## 0 17 7
## 1 7 66
contrasts(pd2[,"status"]) #the one with value=1 is the one for which we are getting the posterior probability
## 1
## 0 0
## 1 1
"Accuracy"
## [1] "Accuracy"
mean(lda_class==y.test)
## [1] 0.8556701
"Classification Error"
## [1] "Classification Error"
mean(lda_class!=y.test)
## [1] 0.1443299
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100
sum(lda_pred$posterior[,1]>=0.5) #observations with predicted proba >0.5
## [1] 24
sum(lda_pred$posterior[,1]<0.5)
## [1] 73
#MY OWN POSTERIOR PROBABILITY THRESHOLD
#sum(lda_pred$posterior[,1]>0.9) #observations with predicted proba >0.9
#my_threshlold_classes=ifelse(lda_pred$posterior[,1]>0.9, "yes", "no") #yes being the one with contrast=1
#CV for LDA
k=10 #number of folds
Dataset=pd2
set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
## 1 2 3 4 5 6 7 8 9 10
## 19 22 20 13 16 12 25 22 18 28
error=vector()#error for each fold
for (j in 1:k) #j refers to folds
{
lda_fit = lda(status~., data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold
lda_pred = predict(lda_fit, Dataset[folds==j,]) #test data, current fold
lda_class = lda_pred$class #the class predictions of our fitted model on test data
error=append(error, mean(lda_class!=Dataset[folds==j, "status"])) #cv error
}
cv_error=mean(error)
cv_error
## [1] 0.1172264
Using Predictors Selected by Lasso status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2
lda_fit = lda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=pd2[,-5], subset=train)
lda_fit
## Call:
## lda(status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.APQ + DFA + spread1 +
## spread2 + D2, data = pd2[, -5], subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.244898 0.755102
##
## Group means:
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.APQ DFA spread1 spread2 D2
## 0 180.8565 209.1609 0.01322583 0.6909354 -6.676518 0.1636713 2.132207
## 1 142.4056 189.3842 0.02978635 0.7310390 -5.278859 0.2474796 2.432709
##
## Coefficients of linear discriminants:
## LD1
## MDVP.Fo.Hz. -0.0155899826
## MDVP.Fhi.Hz. 0.0005662592
## MDVP.APQ -0.3739268589
## DFA 5.6755575641
## spread1 0.4777103752
## spread2 0.5062658518
## D2 1.2190753822
plot(lda_fit) #produces plots of the linear discriminants
lda_pred = predict(lda_fit, pd2[test,])
names(lda_pred) #the names are class, contains LDA’s predictions
## [1] "class" "posterior" "x"
lda_class = lda_pred$class #the class predictions of our fitted model on test data
table(lda_class, y.test) #predictions vs real values of test data
## y.test
## lda_class 0 1
## 0 14 6
## 1 10 67
contrasts(pd2[,"status"]) #the one with value=1 is the one for which we are getting the posterior probability
## 1
## 0 0
## 1 1
"Accuracy"
## [1] "Accuracy"
mean(lda_class==y.test)
## [1] 0.8350515
"Classification Error"
## [1] "Classification Error"
mean(lda_class!=y.test)
## [1] 0.1649485
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100
sum(lda_pred$posterior[,1]>=0.5) #observations with predicted proba >0.5
## [1] 20
sum(lda_pred$posterior[,1]<0.5)
## [1] 77
#MY OWN POSTERIOR PROBABILITY THRESHOLD
#sum(lda_pred$posterior[,1]>0.9) #observations with predicted proba >0.9
#my_threshlold_classes=ifelse(lda_pred$posterior[,1]>0.9, "yes", "no") #yes being the one with contrast=1
#CV for LDA
k=10 #number of folds
Dataset=pd2
set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
## 1 2 3 4 5 6 7 8 9 10
## 19 22 20 13 16 12 25 22 18 28
error=vector()#error for each fold
for (j in 1:k) #j refers to folds
{
lda_fit = lda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold
lda_pred = predict(lda_fit, Dataset[folds==j,]) #test data, current fold
lda_class = lda_pred$class #the class predictions of our fitted model on test data
error=append(error, mean(lda_class!=Dataset[folds==j, "status"])) #cv error
}
cv_error=mean(error)
cv_error
## [1] 0.1498723
qda_fit=qda(status~., data=pd2, subset = train)
qda_fit
## Call:
## qda(status ~ ., data = pd2, subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.244898 0.755102
##
## Group means:
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter... MDVP.Jitter.Abs.
## 0 180.8565 209.1609 136.2219 0.004001667 2.516667e-05
## 1 142.4056 189.3842 108.2484 0.007763919 5.581081e-05
## MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## 0 0.001983750 0.002148750 0.0059525 0.01721833 0.1580000 0.009189167
## 1 0.004226351 0.004345676 0.0126800 0.03571662 0.3466757 0.018682162
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR RPDE DFA
## 0 0.01038167 0.01322583 0.02756625 0.01295083 24.76229 0.4501494 0.6909354
## 1 0.02135635 0.02978635 0.05604608 0.03406108 20.75650 0.5152109 0.7310390
## spread1 spread2 D2 PPE
## 0 -6.676518 0.1636713 2.132207 0.1312950
## 1 -5.278859 0.2474796 2.432709 0.2400174
qda_class=predict(qda_fit, pd2[test,])$class
table(qda_class, y.test)
## y.test
## qda_class 0 1
## 0 7 1
## 1 17 72
"Accuracy"
## [1] "Accuracy"
mean(qda_class==y.test)
## [1] 0.814433
"Classification Error"
## [1] "Classification Error"
mean(qda_class!=y.test)
## [1] 0.185567
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100
Using the validation set approach, LDA performs best
#CV for QDA
k=10 #number of folds
Dataset=pd2
set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
## 1 2 3 4 5 6 7 8 9 10
## 19 22 20 13 16 12 25 22 18 28
error=vector()#error for each fold
for (j in 1:k) #j refers to folds
{
qda_fit = qda(status~., data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold
qda_class=predict(qda_fit, Dataset[folds==j,])$class #test data, current fold
error=append(error, mean(qda_class!=Dataset[folds==j, "status"])) #cv error
}
cv_error=mean(error)
cv_error
## [1] 0.1312084
Using Predictors Selected by Lasso
qda_fit=qda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=pd2, subset = train)
qda_fit
## Call:
## qda(status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.APQ + DFA + spread1 +
## spread2 + D2, data = pd2, subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.244898 0.755102
##
## Group means:
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.APQ DFA spread1 spread2 D2
## 0 180.8565 209.1609 0.01322583 0.6909354 -6.676518 0.1636713 2.132207
## 1 142.4056 189.3842 0.02978635 0.7310390 -5.278859 0.2474796 2.432709
qda_class=predict(qda_fit, pd2[test,])$class
table(qda_class, y.test)
## y.test
## qda_class 0 1
## 0 19 13
## 1 5 60
"Accuracy"
## [1] "Accuracy"
mean(qda_class==y.test)
## [1] 0.814433
"Classification Error"
## [1] "Classification Error"
mean(qda_class!=y.test)
## [1] 0.185567
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100
#CV for QDA
k=10 #number of folds
Dataset=pd2
set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
## 1 2 3 4 5 6 7 8 9 10
## 19 22 20 13 16 12 25 22 18 28
error=vector()#error for each fold
for (j in 1:k) #j refers to folds
{
qda_fit = qda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold
qda_class=predict(qda_fit, Dataset[folds==j,])$class #test data, current fold
error=append(error, mean(qda_class!=Dataset[folds==j, "status"])) #cv error
}
cv_error=mean(error)
cv_error
## [1] 0.1384934
tree.pd=tree(pd[,"status"]~., pd2[,1:22], subset=train)
summary(tree.pd)
##
## Classification tree:
## tree(formula = pd[, "status"] ~ ., data = pd2[, 1:22], subset = train)
## Variables actually used in tree construction:
## [1] "spread1" "MDVP.Fhi.Hz." "MDVP.APQ" "MDVP.Flo.Hz."
## Number of terminal nodes: 7
## Residual mean deviance: 0.2173 = 19.78 / 91
## Misclassification error rate: 0.06122 = 6 / 98
plot(tree.pd)
text(tree.pd, pretty=0)
Spread1 seems to be the most important predictor here.
tree.pred=predict(tree.pd, pd2[test,], type="class")
table(tree.pred, y.test)
## y.test
## tree.pred 0 1
## 0 14 6
## 1 10 67
"Accuracy"
## [1] "Accuracy"
mean(tree.pred==y.test)
## [1] 0.8350515
"Error"
## [1] "Error"
mean(tree.pred!=y.test)
## [1] 0.1649485
The test error rate seems good. We consider pruning the tree.
set.seed(4706)
cv.pd=cv.tree(tree.pd, FUN=prune.misclass)
names(cv.pd)
## [1] "size" "dev" "k" "method"
cv.pd
## $size
## [1] 7 6 2 1
##
## $dev
## [1] 20 19 15 26
##
## $k
## [1] -Inf 0.00 1.25 13.00
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
$dev is the CV error rate. It seems to be lowest for tree size 2.
par(mfrow=c(1,2))
plot(cv.pd$size, cv.pd$dev, type="b", ylab="cv error")
plot(cv.pd$k, cv.pd$dev, type="b", ylab = "cv error")
Prune the tree and obtain the 2-node tree
pruned.pd=prune.misclass(tree.pd, best=2)
plot(pruned.pd)
text(pruned.pd, pretty=0)
pruned.pred=predict(pruned.pd, pd2[test,], type="class")
table(pruned.pred, y.test)
## y.test
## pruned.pred 0 1
## 0 14 6
## 1 10 67
"Accuracy"
## [1] "Accuracy"
mean(pruned.pred==y.test)
## [1] 0.8350515
"Error"
## [1] "Error"
mean(pruned.pred!=y.test)
## [1] 0.1649485
The unpruned tree actually performed better than the pruned tree.
BAGGING
#bagging
set.seed(4706)
bag.pd=randomForest(pd2[,"status"]~., pd2[,1:22], subset=train, mtry=13, importance=TRUE)
#mtry: Number of variables randomly sampled as candidates at each split. Note that the default values are different for classification (sqrt(p) where p is number of variables in x) and regression (p/3)
bag.pd
##
## Call:
## randomForest(formula = pd2[, "status"] ~ ., data = pd2[, 1:22], mtry = 13, importance = TRUE, subset = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 13
##
## OOB estimate of error rate: 13.27%
## Confusion matrix:
## 0 1 class.error
## 0 15 9 0.37500000
## 1 4 70 0.05405405
plot(bag.pd)
yhat.bag=predict(bag.pd, newdata = pd2[test,])
plot(yhat.bag, y.test, xlab="bagging pred", ylab="actual")
"Error"
## [1] "Error"
mean(yhat.bag!=y.test)
## [1] 0.07216495
"Accuracy"
## [1] "Accuracy"
mean(yhat.bag==y.test)
## [1] 0.9278351
The bagged tree shows the highest accuracy so far, but the OOB estimate of error rate is 13.27%
RANDOM FOREST
set.seed(4706)
rf.pd=randomForest(pd2[,"status"]~., pd2[,1:22], subset=train, mtry=6, importance=TRUE)
yhat.rf=predict(rf.pd, newdata = pd2[test,])
rf.pd
##
## Call:
## randomForest(formula = pd2[, "status"] ~ ., data = pd2[, 1:22], mtry = 6, importance = TRUE, subset = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 13.27%
## Confusion matrix:
## 0 1 class.error
## 0 15 9 0.37500000
## 1 4 70 0.05405405
plot(rf.pd)
"Error"
## [1] "Error"
mean(yhat.rf!=y.test)
## [1] 0.08247423
"Accuracy"
## [1] "Accuracy"
mean(yhat.rf==y.test)
## [1] 0.9175258
Random Forest performed almost as well as bagging
importance(rf.pd)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## MDVP.Fo.Hz. 11.4448187 11.104339 14.038699 4.5970648
## MDVP.Fhi.Hz. 6.6755381 4.530633 7.030507 1.7144558
## MDVP.Flo.Hz. 3.3536820 1.920882 3.439286 1.0143408
## MDVP.Jitter... -1.0025620 4.510267 3.801288 0.9789375
## MDVP.Jitter.Abs. 2.1330052 4.343872 4.835862 1.1343849
## MDVP.RAP 0.4450338 5.115567 4.859047 1.0574212
## MDVP.PPQ -0.6501283 4.160240 3.312010 1.1718007
## Jitter.DDP 0.2283893 6.492795 6.314745 1.0522543
## MDVP.Shimmer 1.4712924 7.059461 6.899473 0.9189805
## MDVP.Shimmer.dB. 0.6454145 7.370814 7.265857 0.8806220
## Shimmer.APQ3 -1.1320049 4.214220 3.730090 0.7356575
## Shimmer.APQ5 -1.5924925 6.229775 5.108339 1.1045711
## MDVP.APQ 5.1574296 8.979118 10.048115 2.3779397
## Shimmer.DDA -1.6055500 5.965143 4.968150 0.6076920
## NHR 1.2888463 2.548764 2.943565 1.4794231
## HNR -0.3993738 3.070284 2.072825 0.7257339
## RPDE -3.3861256 4.774894 2.629008 0.9505619
## DFA 2.5724624 4.559135 4.793670 1.3004174
## spread1 10.1666590 13.757678 14.840937 5.8751827
## spread2 5.5245157 6.734957 7.791857 1.6422095
## D2 3.9574381 2.227187 3.997375 1.0211105
## PPE 6.9094008 10.733483 11.188341 3.6558096
library(gbm)
## Loaded gbm 2.1.8
#gbm() with the option distribution="gaussian" since this is a regression problem; if it were a bi- nary classification problem, we would use distribution="bernoulli". The argument n.trees=5000 indicates that we want 5000 trees, and the option interaction.depth=4 limits the depth of each tree
set.seed(4706)
boost.pd=gbm(pd2[train,"status"]~., pd2[train,1:22], distribution = "gaussian", n.trees=5000, interaction.depth=4)
#shrinkage=0.001 by default
#can change it
summary(boost.pd)
## var rel.inf
## spread1 spread1 20.18000535
## MDVP.Fo.Hz. MDVP.Fo.Hz. 15.50987695
## spread2 spread2 9.47971138
## DFA DFA 6.80467690
## MDVP.Fhi.Hz. MDVP.Fhi.Hz. 6.25492044
## MDVP.APQ MDVP.APQ 5.82610604
## D2 D2 4.17080580
## MDVP.Shimmer.dB. MDVP.Shimmer.dB. 3.79637001
## MDVP.RAP MDVP.RAP 3.39939800
## Shimmer.APQ5 Shimmer.APQ5 3.29340672
## RPDE RPDE 2.78656052
## NHR NHR 2.45493065
## MDVP.PPQ MDVP.PPQ 2.21570361
## MDVP.Jitter... MDVP.Jitter... 2.21299478
## MDVP.Shimmer MDVP.Shimmer 2.17943916
## MDVP.Flo.Hz. MDVP.Flo.Hz. 2.17164003
## MDVP.Jitter.Abs. MDVP.Jitter.Abs. 1.91427694
## PPE PPE 1.88288846
## HNR HNR 1.68732825
## Shimmer.APQ3 Shimmer.APQ3 1.64984109
## Jitter.DDP Jitter.DDP 0.11166319
## Shimmer.DDA Shimmer.DDA 0.01745572
par(mfrow=c(1,2))
plot(boost.pd, i="spread1")
plot(boost.pd, i="MDVP.Fo.Hz.")
pdavg$status=as.factor(pdavg$status)
summary(pdavg)
## Group.1 MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## Length:32 Min. : 97.94 Min. :107.8 Min. : 66.23
## Class :character 1st Qu.:118.12 1st Qu.:140.4 1st Qu.: 93.42
## Mode :character Median :150.10 Median :197.2 Median :104.58
## Mean :153.86 Mean :195.6 Mean :116.27
## 3rd Qu.:183.74 3rd Qu.:226.5 3rd Qu.:128.35
## Max. :243.81 Max. :393.9 Max. :222.12
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ
## Min. :0.002147 Min. :9.167e-06 Min. :0.000925 Min. :0.001133
## 1st Qu.:0.003864 1st Qu.:2.500e-05 1st Qu.:0.001782 1st Qu.:0.002066
## Median :0.004832 Median :3.536e-05 Median :0.002426 Median :0.002828
## Mean :0.006156 Mean :4.375e-05 Mean :0.003271 Mean :0.003407
## 3rd Qu.:0.006831 3rd Qu.:5.208e-05 3rd Qu.:0.003833 3rd Qu.:0.003731
## Max. :0.020787 Max. :1.600e-04 Max. :0.012718 Max. :0.012237
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## Min. :0.002780 Min. :0.01080 Min. :0.09567 Min. :0.005383
## 1st Qu.:0.005348 1st Qu.:0.01700 1st Qu.:0.16094 1st Qu.:0.008970
## Median :0.007278 Median :0.02355 Median :0.21708 Median :0.013267
## Mean :0.009812 Mean :0.02941 Mean :0.27874 Mean :0.015527
## 3rd Qu.:0.011500 3rd Qu.:0.03656 3rd Qu.:0.33029 3rd Qu.:0.019817
## Max. :0.038157 Max. :0.07843 Max. :0.87114 Max. :0.037554
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## Min. :0.006758 Min. :0.008195 Min. :0.01615 Min. :0.001495
## 1st Qu.:0.010235 1st Qu.:0.013479 1st Qu.:0.02692 1st Qu.:0.005696
## Median :0.013642 Median :0.018527 Median :0.03979 Median :0.012793
## Mean :0.017677 Mean :0.023763 Mean :0.04658 Mean :0.024273
## 3rd Qu.:0.020677 3rd Qu.:0.028656 3rd Qu.:0.05945 3rd Qu.:0.025694
## Max. :0.050831 Max. :0.081151 Max. :0.11266 Max. :0.174122
## HNR status RPDE DFA spread1
## Min. :12.06 0: 8 Min. :0.3275 Min. :0.6383 Min. :-7.590
## 1st Qu.:19.60 1:24 1st Qu.:0.4281 1st Qu.:0.6689 1st Qu.:-6.275
## Median :22.17 Median :0.4858 Median :0.7200 Median :-5.630
## Mean :21.96 Mean :0.4981 Mean :0.7181 Mean :-5.699
## 3rd Qu.:24.81 3rd Qu.:0.5890 3rd Qu.:0.7628 3rd Qu.:-5.173
## Max. :30.99 Max. :0.6384 Max. :0.8213 Max. :-3.657
## spread2 D2 PPE
## Min. :0.05518 Min. :1.796 Min. :0.06811
## 1st Qu.:0.17310 1st Qu.:2.193 1st Qu.:0.15685
## Median :0.22778 Median :2.324 Median :0.20538
## Mean :0.22475 Mean :2.372 Mean :0.20532
## 3rd Qu.:0.28154 3rd Qu.:2.536 3rd Qu.:0.23381
## Max. :0.35930 Max. :3.141 Max. :0.39514
pairs(pdavg[,-1], col=pdavg$status)
#red=parkinson's
plot(pdavg$PPE, pdavg$spread1, col=pdavg$status)
plot(pdavg$PPE, pdavg$spread1, col=pdavg$status)
pd2avg=pdavg[,-1]
rownames(pd2avg) = pdavg[,1]
pd2avg=pd2avg[,-17]
par(mfrow=c(3,3))
for (i in colnames(pd2avg))
{hist(pd2avg[,i], main=i)}
par(mfrow=c(1,3))
for (i in colnames(pd2avg))
{boxplot(pd2avg[,i], main=i)}
Fewer outliers after averaging
outl=boxplot.stats(pd2avg[,"MDVP.Flo.Hz."])$out
ind=which(pd2avg[,"MDVP.Flo.Hz."] %in% c(outl))
pd2avg[c(ind),]
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## phon_R01_S07 200.2668 210.8843 194.3662 0.002163333
## phon_R01_S10 243.8143 254.2805 222.1150 0.002390000
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer
## phon_R01_S07 9.666667e-06 0.001175 0.001281667 0.003523333 0.01080333
## phon_R01_S10 9.166667e-06 0.001285 0.001486667 0.003850000 0.01530833
## MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA
## phon_R01_S07 0.09566667 0.005383333 0.006870000 0.00819500 0.01614833
## phon_R01_S10 0.13700000 0.008641667 0.009243333 0.01073333 0.02592167
## NHR HNR RPDE DFA spread1 spread2
## phon_R01_S07 0.001495000 30.99217 0.3955782 0.7414823 -7.589537 0.1730488
## phon_R01_S10 0.005421667 24.61467 0.4517002 0.6382515 -7.105562 0.1298533
## D2 PPE
## phon_R01_S07 1.795701 0.06811333
## phon_R01_S10 2.298464 0.09839017
pdavg[c(ind),]
## Group.1 MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter...
## 6 phon_R01_S07 200.2668 210.8843 194.3662 0.002163333
## 8 phon_R01_S10 243.8143 254.2805 222.1150 0.002390000
## MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer
## 6 9.666667e-06 0.001175 0.001281667 0.003523333 0.01080333
## 8 9.166667e-06 0.001285 0.001486667 0.003850000 0.01530833
## MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## 6 0.09566667 0.005383333 0.006870000 0.00819500 0.01614833 0.001495000
## 8 0.13700000 0.008641667 0.009243333 0.01073333 0.02592167 0.005421667
## HNR status RPDE DFA spread1 spread2 D2 PPE
## 6 30.99217 0 0.3955782 0.7414823 -7.589537 0.1730488 1.795701 0.06811333
## 8 24.61467 0 0.4517002 0.6382515 -7.105562 0.1298533 2.298464 0.09839017
Both healthy, not going to remove (will lost 2 out of 8)
pca_pdavg=prcomp(pd2avg, scale=TRUE)
str(pca_pdavg)
## List of 5
## $ sdev : num [1:22] 3.686 1.678 1.261 1.174 0.955 ...
## $ rotation: num [1:22, 1:22] 0.0613 0.00712 0.07747 -0.25043 -0.23769 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
## .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:22] 1.54e+02 1.96e+02 1.16e+02 6.16e-03 4.37e-05 ...
## ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
## $ scale : Named num [1:22] 4.09e+01 6.28e+01 3.76e+01 4.26e-03 3.12e-05 ...
## ..- attr(*, "names")= chr [1:22] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..." ...
## $ x : num [1:32, 1:22] -4.493 0.505 1.423 -2.501 1.449 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "phon_R01_S01" "phon_R01_S02" "phon_R01_S04" "phon_R01_S05" ...
## .. ..$ : chr [1:22] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
ggbiplot(pca_pdavg)
biplot(pca_pdavg, scale=0)
ggbiplot(pca_pdavg, labels=rownames(pd2avg))
disease_stat=ifelse(pdavg$status=="1", "PD", "HLT")
ggbiplot(pca_pdavg,ellipse=TRUE, groups=disease_stat)
Clustering seems to make a bit more sense here
head(pca_pdavg$rotation)
## PC1 PC2 PC3 PC4 PC5
## MDVP.Fo.Hz. 0.061299358 0.54841845 -0.04272008 0.1334530 -0.01731036
## MDVP.Fhi.Hz. 0.007124914 0.42368002 0.29295524 -0.1465257 -0.38838306
## MDVP.Flo.Hz. 0.077473404 0.36946891 -0.39066813 0.1664252 -0.08125467
## MDVP.Jitter... -0.250434358 0.06358702 -0.10210604 -0.2735671 -0.09124493
## MDVP.Jitter.Abs. -0.237685724 -0.11220925 -0.08416312 -0.3426294 -0.02942348
## MDVP.RAP -0.245194803 0.09518996 -0.11760315 -0.2968125 -0.02973169
## PC6 PC7 PC8 PC9 PC10
## MDVP.Fo.Hz. 0.06451036 0.10129636 0.250782418 0.17629433 -0.38212957
## MDVP.Fhi.Hz. -0.31046652 0.53176730 -0.238047987 0.11492100 0.12392285
## MDVP.Flo.Hz. 0.68823690 0.09051954 -0.022434181 -0.12629583 0.08220497
## MDVP.Jitter... 0.02412989 -0.03705645 -0.001180571 0.05092826 0.01474532
## MDVP.Jitter.Abs. 0.06793527 -0.04169890 -0.045283585 -0.06316520 -0.23387602
## MDVP.RAP 0.05694397 -0.09228001 0.022900453 0.08702280 0.02192711
## PC11 PC12 PC13 PC14 PC15
## MDVP.Fo.Hz. -0.4772299 0.04323745 -0.37155927 0.070051482 -0.006036326
## MDVP.Fhi.Hz. 0.2795802 0.09946360 0.06378219 -0.055530710 -0.013558098
## MDVP.Flo.Hz. 0.3619906 -0.01154541 0.15208114 -0.077901298 -0.052741319
## MDVP.Jitter... -0.2084222 0.08015262 0.11618340 -0.003004445 -0.183885581
## MDVP.Jitter.Abs. 0.1904564 0.20951570 0.01657499 0.334014047 -0.448111892
## MDVP.RAP -0.1438669 0.16760632 0.08978935 -0.202068305 0.199174295
## PC16 PC17 PC18 PC19 PC20
## MDVP.Fo.Hz. 0.152753484 -0.14410026 0.00301326 -0.055165936 -0.036871357
## MDVP.Fhi.Hz. -0.016256399 -0.03462657 0.03166574 0.007141266 0.001807813
## MDVP.Flo.Hz. -0.006152586 0.02540490 0.01298393 0.036460549 0.012778431
## MDVP.Jitter... 0.341058852 0.47759418 0.20442131 0.590944110 0.047692444
## MDVP.Jitter.Abs. 0.188485433 -0.49274449 0.17221276 -0.120652132 -0.152067569
## MDVP.RAP -0.376085648 -0.15122816 -0.06181958 0.038611573 -0.017555511
## PC21 PC22
## MDVP.Fo.Hz. 1.658498e-05 -7.976223e-05
## MDVP.Fhi.Hz. -2.347411e-05 -1.093318e-05
## MDVP.Flo.Hz. -5.062547e-06 4.774867e-06
## MDVP.Jitter... 6.122519e-04 -2.340494e-04
## MDVP.Jitter.Abs. 6.404405e-04 -4.669768e-04
## MDVP.RAP 7.050823e-01 5.695548e-02
Proprotion of Variance Explained by PC1
pr.var=pca_pdavg$sdev^2
pve=pr.var/sum(pr.var)
par(mfrow=c(1,2))
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained ", ylim=c(0,1),type="b")
plot(cumsum(pve), xlab="Principal Component ", ylab=" Cumulative Proportion of Variance Explained ", ylim=c(0,1), type="b")
PC1 explains 61.8% of the variance
Clustering with K=2
km.pd2avg=kmeans(pd2avg,2,nstart=20)
head(km.pd2avg$cluster)
## phon_R01_S01 phon_R01_S02 phon_R01_S04 phon_R01_S05 phon_R01_S06 phon_R01_S07
## 1 1 1 1 1 2
head(pdavg$status)
## [1] 1 1 1 1 1 0
## Levels: 0 1
s=ifelse(pdavg$status==1, 1, 2) #so the colors match
head(s)
## [1] 1 1 1 1 1 2
par(mfrow=c(1,2))
for (i in colnames(pd2avg))
{plot(pd2avg[,i], col=(km.pd2avg$cluster), main="K-Means Clustering; K=2", xlab="", ylab=i, pch=20, cex=2)
plot(pd2avg[,i], col=(s), main="By Status; black=PD", xlab="", ylab=i, pch=20, cex=2)}
Clustering seems to make a lot more sense now. General trends of the 2 clusters look similar to the actual response classes.
Since there are only 8 healthy individuals, I make sure to include a sufficient number in the training and test sets, so I split them into healthy vs diseased, sample from each, and then combine them into training and test sets.
pd2avg=cbind(pd2avg, pdavg[,"status"])
colnames(pd2avg)=c("MDVP.Fo.Hz.", "MDVP.Fhi.Hz." , "MDVP.Flo.Hz." , "MDVP.Jitter..." , "MDVP.Jitter.Abs." ,"MDVP.RAP" , "MDVP.PPQ" , "Jitter.DDP" , "MDVP.Shimmer" , "MDVP.Shimmer.dB." ,"Shimmer.APQ3" , "Shimmer.APQ5" ,
"MDVP.APQ" , "Shimmer.DDA" , "NHR" , "HNR" , "RPDE" , "DFA" ,
"spread1" , "spread2" , "D2" , "PPE" , "status")
pd_dis_idx=which(pd2avg$status==1) #indeces of PD individuals
pd_health_idx=which(pd2avg$status==0) #indeces of healthy individuals
#length(pd_health_idx)
#length(pd_dis_idx)
set.seed(4706)
#length(pd_dis_idx)
#length(pd_health_idx)
#total: 24 diseased, 8 healthy
#12 diseased test
#4 healthy test
#12 diseased train
#4 healthy train
test_health=sample(pd_health_idx, 4, replace = FALSE) #sample from indeces
#test_health
test_dis=sample(pd_dis_idx, 12, replace = FALSE) #sample from indeces
#test_dis
testavg=append(test_dis, test_health) #test set
tot=1:nrow(pd2avg)
trainavg=tot[-testavg] #training set
#trainavg
#testavg
y.trainavg=pd2avg[trainavg, "status"]
y.testavg=pd2avg[testavg, "status"]
y.trainavg
## [1] 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0
## Levels: 0 1
y.testavg
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
## Levels: 0 1
#trainavg
#testavg
regfit_fullavg=regsubsets(status~., data=pd2avg, subset=train, nvmax=23) #nvmax=p
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 7 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 15
## Warning in leaps.exhaustive(a, really.big): XHAUST returned error code -999
reg_full_sumavg=summary(regfit_fullavg) #returns R^2, adjusted R^2, AIC, BIC, Cp
reg_full_sumavg
## Subset selection object
## Call: regsubsets.formula(status ~ ., data = pd2avg, subset = train,
## nvmax = 23)
## 22 Variables (and intercept)
## Forced in Forced out
## MDVP.Fo.Hz. FALSE FALSE
## MDVP.Fhi.Hz. FALSE FALSE
## MDVP.Flo.Hz. FALSE FALSE
## MDVP.Jitter... FALSE FALSE
## MDVP.Jitter.Abs. FALSE FALSE
## MDVP.RAP FALSE FALSE
## MDVP.PPQ FALSE FALSE
## Jitter.DDP FALSE FALSE
## MDVP.Shimmer FALSE FALSE
## MDVP.Shimmer.dB. FALSE FALSE
## Shimmer.APQ3 FALSE FALSE
## Shimmer.APQ5 FALSE FALSE
## MDVP.APQ FALSE FALSE
## Shimmer.DDA FALSE FALSE
## NHR FALSE FALSE
## HNR FALSE FALSE
## RPDE FALSE FALSE
## DFA FALSE FALSE
## spread1 FALSE FALSE
## spread2 FALSE FALSE
## D2 FALSE FALSE
## PPE FALSE FALSE
## 1 subsets of each size up to 15
## Selection Algorithm: exhaustive
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter... MDVP.Jitter.Abs.
## 1 ( 1 ) "*" " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " "
## 4 ( 1 ) "*" "*" "*" "*" " "
## 5 ( 1 ) "*" "*" "*" "*" "*"
## 6 ( 1 ) "*" "*" "*" "*" "*"
## 7 ( 1 ) "*" "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" "*"
## MDVP.RAP MDVP.PPQ Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB.
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " "
## 7 ( 1 ) "*" "*" " " " " " "
## 8 ( 1 ) "*" "*" "*" " " " "
## 9 ( 1 ) "*" "*" "*" "*" " "
## 10 ( 1 ) "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" "*"
## Shimmer.APQ3 Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR HNR RPDE DFA
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " " " " " "
## 9 ( 1 ) " " " " " " " " " " " " " " " "
## 10 ( 1 ) " " " " " " " " " " " " " " " "
## 11 ( 1 ) "*" " " " " " " " " " " " " " "
## 12 ( 1 ) "*" "*" " " " " " " " " " " " "
## 13 ( 1 ) "*" "*" "*" " " " " " " " " " "
## 14 ( 1 ) "*" "*" "*" "*" " " " " " " " "
## 15 ( 1 ) "*" "*" "*" "*" "*" " " " " " "
## spread1 spread2 D2 PPE
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " " " " " " "
## 7 ( 1 ) " " " " " " " "
## 8 ( 1 ) " " " " " " " "
## 9 ( 1 ) " " " " " " " "
## 10 ( 1 ) " " " " " " " "
## 11 ( 1 ) " " " " " " " "
## 12 ( 1 ) " " " " " " " "
## 13 ( 1 ) " " " " " " " "
## 14 ( 1 ) " " " " " " " "
## 15 ( 1 ) " " " " " " " "
names(reg_full_sumavg)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
names(regfit_fullavg)
## [1] "np" "nrbar" "d" "rbar" "thetab" "first"
## [7] "last" "vorder" "tol" "rss" "bound" "nvmax"
## [13] "ress" "ir" "nbest" "lopt" "il" "ier"
## [19] "xnames" "method" "force.in" "force.out" "sserr" "intercept"
## [25] "lindep" "nullrss" "nn" "call"
Plotting
par(mfrow=c(2,2))
#RSS
plot(reg_full_sumavg$rss, xlab="Number of variables", ylab="RSS", type ="l")
mnrss=which.min(reg_full_sumavg$rss)
#Plotting min RSS
points(mnrss, reg_full_sumavg$adjr2[mnrss], col="red", cex=2, pch=20)
#Adjusted R^2
plot(reg_full_sumavg$adjr2, xlab="Number of variables", ylab="Adjusted R^2", type ="l")
mxr2=which.max(reg_full_sumavg$adjr2)
#Plotting max adjusted R^2
points(mxr2, reg_full_sumavg$adjr2[mxr2], col="red", cex=2, pch=20)
#BIC
plot(reg_full_sumavg$bic, xlab="Number of variables", ylab = "BIC", type="l")
mnbic=which.min(reg_full_sum$bic)
points(mnbic, reg_full_sumavg$bic[mnbic], col="green", pch=20, cex=2)
BIC selects for a smaller model (5 predictors) (heavier penalty)
Adjusted R^2 max is around 5 as well.
As expected, RSS decreases as more predictors are added, but this is not indicative of test error rate.
Plotting Selected Variables
plot(regfit_fullavg, scale="r2")
plot(regfit_fullavg, scale="adjr2")
plot(regfit_fullavg, scale="bic")
Coef of models
"model with highest adjusted R^2"
## [1] "model with highest adjusted R^2"
coef(regfit_fullavg, mxr2) #model with highest adjusted R^2
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## 3.723613e+00 -9.071021e-03 3.961108e-04 -4.025395e-03
## MDVP.Jitter... MDVP.Jitter.Abs.
## 1.320928e+02 -1.933071e+04
"model with lowest CP"
## [1] "model with lowest CP"
coef(regfit_fullavg, mncp) #model with lowest CP
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## 3.657245e+00 -8.664956e-03 2.801145e-04 -4.022328e-03
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP
## 1.506663e+02 -1.862220e+04 -3.690324e+01
"model with lowest BIC"
## [1] "model with lowest BIC"
coef(regfit_fullavg, mnbic) #model with lowest BIC
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## 3.723613e+00 -9.071021e-03 3.961108e-04 -4.025395e-03
## MDVP.Jitter... MDVP.Jitter.Abs.
## 1.320928e+02 -1.933071e+04
x=model.matrix(status~., pd2avg)[,-1] #[,-1] to remove the intercept
y=pd2avg$status
#automatically transforms any qualitative variables into dummy variables
#standardizes by default
#alpha=0 ridge regression, alpha=1 lasso
grid=10^seq(10, -2, length=100) #10 to the power of a sequence -> our lambda values
grid
## [1] 1.000000e+10 7.564633e+09 5.722368e+09 4.328761e+09 3.274549e+09
## [6] 2.477076e+09 1.873817e+09 1.417474e+09 1.072267e+09 8.111308e+08
## [11] 6.135907e+08 4.641589e+08 3.511192e+08 2.656088e+08 2.009233e+08
## [16] 1.519911e+08 1.149757e+08 8.697490e+07 6.579332e+07 4.977024e+07
## [21] 3.764936e+07 2.848036e+07 2.154435e+07 1.629751e+07 1.232847e+07
## [26] 9.326033e+06 7.054802e+06 5.336699e+06 4.037017e+06 3.053856e+06
## [31] 2.310130e+06 1.747528e+06 1.321941e+06 1.000000e+06 7.564633e+05
## [36] 5.722368e+05 4.328761e+05 3.274549e+05 2.477076e+05 1.873817e+05
## [41] 1.417474e+05 1.072267e+05 8.111308e+04 6.135907e+04 4.641589e+04
## [46] 3.511192e+04 2.656088e+04 2.009233e+04 1.519911e+04 1.149757e+04
## [51] 8.697490e+03 6.579332e+03 4.977024e+03 3.764936e+03 2.848036e+03
## [56] 2.154435e+03 1.629751e+03 1.232847e+03 9.326033e+02 7.054802e+02
## [61] 5.336699e+02 4.037017e+02 3.053856e+02 2.310130e+02 1.747528e+02
## [66] 1.321941e+02 1.000000e+02 7.564633e+01 5.722368e+01 4.328761e+01
## [71] 3.274549e+01 2.477076e+01 1.873817e+01 1.417474e+01 1.072267e+01
## [76] 8.111308e+00 6.135907e+00 4.641589e+00 3.511192e+00 2.656088e+00
## [81] 2.009233e+00 1.519911e+00 1.149757e+00 8.697490e-01 6.579332e-01
## [86] 4.977024e-01 3.764936e-01 2.848036e-01 2.154435e-01 1.629751e-01
## [91] 1.232847e-01 9.326033e-02 7.054802e-02 5.336699e-02 4.037017e-02
## [96] 3.053856e-02 2.310130e-02 1.747528e-02 1.321941e-02 1.000000e-02
ridge.mod=glmnet(x, y, alpha=0, lambda=grid, family = "binomial")
#rows (predictors), columns (lambda values)
ridge.mod$lambda[50] #lambda value
## [1] 11497.57
coef(ridge.mod)[,50] #coef for this lambda value (index)
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## 1.098296e+00 -3.758310e-07 -1.594538e-07 -4.586503e-07
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ
## 2.825076e-03 4.698392e-01 4.446949e-03 5.073418e-03
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## 1.482471e-03 9.544775e-04 8.826712e-05 1.722701e-03
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## 1.392087e-03 1.037698e-03 5.742150e-04 2.353659e-04
## HNR RPDE DFA spread1
## -3.480791e-06 1.358691e-04 1.742388e-04 2.530471e-05
## spread2 D2 PPE
## 2.975963e-04 4.869601e-05 2.849646e-04
Estimating Test Error
ridge.mod=glmnet(x[trainavg,],y[trainavg],alpha=0,lambda=grid, thresh=1e-12, family = "binomial")
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
ridge.pred=predict(ridge.mod,s=4,newx=x[testavg,])
mean((ridge.pred-y.testavg)^2)
## Warning in Ops.factor(ridge.pred, y.testavg): '-' not meaningful for factors
## [1] NA
#check vs least squares lambda=0
ridge.pred=predict(ridge.mod, s=0, newx=x[testavg,], exact = T, x=x[trainavg,], y=y[trainavg])
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
mean((ridge.pred-y.testavg)^2)
## Warning in Ops.factor(ridge.pred, y.testavg): '-' not meaningful for factors
## [1] NA
lm(y~x, subset = trainavg)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
##
## Call:
## lm(formula = y ~ x, subset = trainavg)
##
## Coefficients:
## (Intercept) xMDVP.Fo.Hz. xMDVP.Fhi.Hz. xMDVP.Flo.Hz.
## 1.616e+01 -1.390e-01 1.053e-02 3.199e-02
## xMDVP.Jitter... xMDVP.Jitter.Abs. xMDVP.RAP xMDVP.PPQ
## 6.457e+01 -7.708e+05 -1.013e+06 4.988e+03
## xJitter.DDP xMDVP.Shimmer xMDVP.Shimmer.dB. xShimmer.APQ3
## 3.390e+05 3.633e+03 -1.177e+01 -7.903e+05
## xShimmer.APQ5 xMDVP.APQ xShimmer.DDA xNHR
## -2.930e+03 -6.484e+02 2.627e+05 -1.010e+01
## xHNR xRPDE xDFA xspread1
## NA NA NA NA
## xspread2 xD2 xPPE
## NA NA NA
predict(ridge.mod,s=0, exact = T, type="coefficients", x=x[trainavg,], y=y[trainavg])[1:20,]
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## 3.982968e+02 -2.515352e-01 -1.176685e-01 -1.135584e-01
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ
## -8.343540e+02 -1.266549e+06 2.306985e+04 -4.446661e+03
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## 1.403851e+03 -3.894412e+01 3.653470e+01 -5.647350e+02
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## 3.416166e+02 -4.248073e+02 8.942996e+01 -1.723648e+02
## HNR RPDE DFA spread1
## -1.050122e+00 -1.443907e+02 -8.948188e+01 3.023055e+01
Cross Validation for Ridge
set.seed(4706)
cv.out=cv.glmnet(x[trainavg,], y[trainavg], alpha=0, family = "binomial")
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
## fold
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
## [1] 2.960083
#predict on test using best lambda
ridge.pred=predict(ridge.mod, s=bestlam, newx=x[testavg,])
mean((ridge.pred-y.testavg)^2)
## Warning in Ops.factor(ridge.pred, y.testavg): '-' not meaningful for factors
## [1] NA
#refit on full data
out=glmnet(x,y,alpha=0, family = "binomial")
predict(out,type="coefficients",s=bestlam)[1:20,]
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz.
## 4.367547e-01 -1.062314e-03 -5.054642e-04 -1.301264e-03
## MDVP.Jitter... MDVP.Jitter.Abs. MDVP.RAP MDVP.PPQ
## 5.349053e+00 1.003401e+03 8.588853e+00 9.797759e+00
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## 2.863335e+00 2.090017e+00 1.888996e-01 3.703174e+00
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA NHR
## 3.005893e+00 2.305308e+00 1.234277e+00 3.349048e-01
## HNR RPDE DFA spread1
## -7.265099e-03 2.978426e-01 5.182344e-01 6.769356e-02
#Fitting on training data
lasso.mod=glmnet(x[trainavg,], y[trainavg], alpha=1, lambda=grid, family="binomial")
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
#Getting best lambda
set.seed(4706)
cv.out=cv.glmnet(x[trainavg,], y[trainavg], alpha=1, family="binomial")
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
## fold
plot(cv.out)
bestlam=cv.out$lambda.min #lambda.min=lambda with lowest error
bestlam
## [1] 0.122311
lasso.pred=predict(lasso.mod, s=bestlam, newx=x[testavg,])
mean((lasso.pred!=y.testavg))
## [1] 1
#Refitting on full data using best lambda
out=glmnet(x, y, alpha=1, lambda=grid, family="binomial")
lasso.coef=predict(out, type="coefficients", s=bestlam)
lasso.coef
## 23 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 7.12316442
## MDVP.Fo.Hz. .
## MDVP.Fhi.Hz. .
## MDVP.Flo.Hz. .
## MDVP.Jitter... .
## MDVP.Jitter.Abs. .
## MDVP.RAP .
## MDVP.PPQ .
## Jitter.DDP .
## MDVP.Shimmer .
## MDVP.Shimmer.dB. .
## Shimmer.APQ3 .
## Shimmer.APQ5 .
## MDVP.APQ .
## Shimmer.DDA .
## NHR .
## HNR .
## RPDE .
## DFA .
## spread1 1.02192948
## spread2 0.07698237
## D2 .
## PPE .
Lasso: spread1+spread2 Best Subsets: MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.Flo.Hz.+MDVP.Jitter…+MDVP.Jitter.Abs.
I fit a logistic regression model on the training data
logist_reg_fit=glm(status~., data=pd2avg, family="binomial", subset=trainavg)
summary(logist_reg_fit)
##
## Call:
## glm(formula = status ~ ., family = "binomial", data = pd2avg,
## subset = trainavg)
##
## Deviance Residuals:
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.498e+02 4.080e+06 0 1
## MDVP.Fo.Hz. -7.109e+00 4.004e+04 0 1
## MDVP.Fhi.Hz. 5.383e-01 3.485e+03 0 1
## MDVP.Flo.Hz. 1.636e+00 2.153e+04 0 1
## MDVP.Jitter... 3.301e+03 4.742e+08 0 1
## MDVP.Jitter.Abs. -3.941e+07 1.998e+11 0 1
## MDVP.RAP -5.177e+07 8.979e+11 0 1
## MDVP.PPQ 2.551e+05 1.311e+09 0 1
## Jitter.DDP 1.733e+07 2.999e+11 0 1
## MDVP.Shimmer 1.858e+05 3.789e+09 0 1
## MDVP.Shimmer.dB. -6.020e+02 3.953e+07 0 1
## Shimmer.APQ3 -4.041e+07 2.469e+11 0 1
## Shimmer.APQ5 -1.498e+05 1.352e+09 0 1
## MDVP.APQ -3.315e+04 1.032e+09 0 1
## Shimmer.DDA 1.343e+07 8.282e+10 0 1
## NHR -5.163e+02 9.754e+07 0 1
## HNR NA NA NA NA
## RPDE NA NA NA NA
## DFA NA NA NA NA
## spread1 NA NA NA NA
## spread2 NA NA NA NA
## D2 NA NA NA NA
## PPE NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.7995e+01 on 15 degrees of freedom
## Residual deviance: 2.5232e-10 on 0 degrees of freedom
## AIC: 32
##
## Number of Fisher Scoring iterations: 24
#"Coefficients"
#coef(logist_reg_fit)
#"P-Values"
#summary(logist_reg_fit)$coef[ ,4]
alias(logist_reg_fit)
## Model :
## status ~ MDVP.Fo.Hz. + MDVP.Fhi.Hz. + MDVP.Flo.Hz. + MDVP.Jitter... +
## MDVP.Jitter.Abs. + MDVP.RAP + MDVP.PPQ + Jitter.DDP + MDVP.Shimmer +
## MDVP.Shimmer.dB. + Shimmer.APQ3 + Shimmer.APQ5 + MDVP.APQ +
## Shimmer.DDA + NHR + HNR + RPDE + DFA + spread1 + spread2 +
## D2 + PPE
##
## Complete :
## (Intercept) MDVP.Fo.Hz. MDVP.Fhi.Hz.
## HNR -82879/5671 -10961/148076 -385325/39128651
## RPDE -158161/156468 -349/170805 -32835/38997197
## DFA 176655/180827 -137763/38059693 0
## spread1 17667495/2986322 -383479/2071270 3524/210831
## spread2 871889/1197555 -8563/505119 5638/5059529
## D2 24770678/2053841 -281/8072 14075/1957267
## PPE 137556/148921 -281/19851 21051/17505865
## MDVP.Flo.Hz. MDVP.Jitter... MDVP.Jitter.Abs.
## HNR 49702/276611 -251418644/43871 -1056184219703/6393458
## RPDE 1942/616161 1064396379/4850359 456427444/983027
## DFA 65500/37454927 -36588967/20966213 -1355726976/79847
## spread1 16466/282463 134301521/132460 -838661367980/876429
## spread2 3833/550609 627524/43743 -62958739/722
## D2 -3521/441912 860819258/9794683 -216199595311/957652
## PPE 17783/3612099 160541211/2472707 -5202576191/71299
## MDVP.RAP MDVP.PPQ Jitter.DDP
## HNR -350953741215/48578 1972495981/418605 1026611379775/425339
## RPDE -2523318741583/11475549 -860119/20075 331672289147/4522823
## DFA 2688385/351 13653250099/106635153 -10904904/4339
## spread1 -3470178386/1679 246243181/40573 3116509553/4512
## spread2 -26599422712/111873 1685536360/2499987 3108416869/39122
## D2 1279509244902/2147443 3445279/4022 -1280208734990/6446441
## PPE -7746462176/42033 1873549/4052 2146996803/34861
## MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3
## HNR 148883753/4547 -38096965/164901 19440390249/10610
## RPDE 1094199/926 -37448/1959 82120533305/629368
## DFA 88932442/2968375 -5780/182241 -68720962054/2017171
## spread1 67015907/9493 -8521374812/137541773 -204402519952/256985
## spread2 21575395/17773 -130899/10544 -13486835327/348808
## D2 -176330590/58491 7107040/196841 -331089559541/528989
## PPE 101139656/156693 -1033384/174191 -447412330/8541
## Shimmer.APQ5 MDVP.APQ Shimmer.DDA
## HNR -133706047907/11078571 -65059252027/7415207 -1153368752/1859
## RPDE -5034363/27323 -47942681/135105 -101367534598/2311399
## DFA -775633/11370 3535/522 196046243/17257
## spread1 -11603605/2876 -1551541081/1026898 10014804295/38011
## spread2 -9951190/17517 -18111419/62975 13110949869/1042237
## D2 6406723/35383 279348338/268789 2283910610/10893
## PPE -15698135687/45933638 -448507/3141 1873408916/108267
## NHR
## HNR -1394957/1514
## RPDE -504175/16323
## DFA -448262/162297
## spread1 -452242799/4523833
## spread2 -569544937/28785675
## D2 141084697/1121771
## PPE -11409/1046
contrasts(pd2avg[,"status"])
## 1
## 0 0
## 1 1
glm_probs = predict(logist_reg_fit, pd2avg[testavg,], type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
head(glm_probs)
## phon_R01_S35 phon_R01_S16 phon_R01_S18 phon_R01_S08 phon_R01_S02 phon_R01_S01
## 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
glm_pred=ifelse(glm_probs>0.5, 1, 0)
head(glm_pred)
## phon_R01_S35 phon_R01_S16 phon_R01_S18 phon_R01_S08 phon_R01_S02 phon_R01_S01
## 1 0 1 1 0 0
length(glm_pred)
## [1] 16
length(y.testavg)
## [1] 16
table(glm_pred, y.testavg)
## y.testavg
## glm_pred 0 1
## 0 2 6
## 1 2 6
Classification Error
mean(glm_pred!=y.testavg)
## [1] 0.5
Accuracy
mean(glm_pred==y.testavg)
## [1] 0.5
colnames(pd2)
## [1] "MDVP.Fo.Hz." "MDVP.Fhi.Hz." "MDVP.Flo.Hz." "MDVP.Jitter..."
## [5] "MDVP.Jitter.Abs." "MDVP.RAP" "MDVP.PPQ" "Jitter.DDP"
## [9] "MDVP.Shimmer" "MDVP.Shimmer.dB." "Shimmer.APQ3" "Shimmer.APQ5"
## [13] "MDVP.APQ" "Shimmer.DDA" "NHR" "HNR"
## [17] "RPDE" "DFA" "spread1" "spread2"
## [21] "D2" "PPE" "status"
logist_reg_fit2=glm(status~MDVP.Fhi.Hz.+MDVP.APQ+NHR+RPDE+spread1+D2+PPE, data=pd2avg, family=binomial, subset=trainavg)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logist_reg_fit)
##
## Call:
## glm(formula = status ~ ., family = "binomial", data = pd2avg,
## subset = trainavg)
##
## Deviance Residuals:
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.498e+02 4.080e+06 0 1
## MDVP.Fo.Hz. -7.109e+00 4.004e+04 0 1
## MDVP.Fhi.Hz. 5.383e-01 3.485e+03 0 1
## MDVP.Flo.Hz. 1.636e+00 2.153e+04 0 1
## MDVP.Jitter... 3.301e+03 4.742e+08 0 1
## MDVP.Jitter.Abs. -3.941e+07 1.998e+11 0 1
## MDVP.RAP -5.177e+07 8.979e+11 0 1
## MDVP.PPQ 2.551e+05 1.311e+09 0 1
## Jitter.DDP 1.733e+07 2.999e+11 0 1
## MDVP.Shimmer 1.858e+05 3.789e+09 0 1
## MDVP.Shimmer.dB. -6.020e+02 3.953e+07 0 1
## Shimmer.APQ3 -4.041e+07 2.469e+11 0 1
## Shimmer.APQ5 -1.498e+05 1.352e+09 0 1
## MDVP.APQ -3.315e+04 1.032e+09 0 1
## Shimmer.DDA 1.343e+07 8.282e+10 0 1
## NHR -5.163e+02 9.754e+07 0 1
## HNR NA NA NA NA
## RPDE NA NA NA NA
## DFA NA NA NA NA
## spread1 NA NA NA NA
## spread2 NA NA NA NA
## D2 NA NA NA NA
## PPE NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.7995e+01 on 15 degrees of freedom
## Residual deviance: 2.5232e-10 on 0 degrees of freedom
## AIC: 32
##
## Number of Fisher Scoring iterations: 24
glm_probs = predict(logist_reg_fit2, pd2[testavg,], type="response")
head(glm_probs)
## phon_R01_S06_1 phon_R01_S02_4 phon_R01_S02_6 phon_R01_S02_1 phon_R01_S01_2
## 1 1 1 1 1
## phon_R01_S01_1
## 1
glm_pred=ifelse(glm_probs>0.5, 1, 0)
head(glm_pred)
## phon_R01_S06_1 phon_R01_S02_4 phon_R01_S02_6 phon_R01_S02_1 phon_R01_S01_2
## 1 1 1 1 1
## phon_R01_S01_1
## 1
table(glm_pred, y.testavg)
## y.testavg
## glm_pred 0 1
## 0 3 0
## 1 1 12
"Error"
## [1] "Error"
mean(glm_pred!=y.testavg)
## [1] 0.0625
"Accuracy"
## [1] "Accuracy"
mean(glm_pred==y.testavg)
## [1] 0.9375
lda_fit = lda(status~., data=pd2avg[,-5], subset=trainavg)
## Warning in lda.default(x, grouping, ...): variables are collinear
lda_fit
## Call:
## lda(status ~ ., data = pd2avg[, -5], subset = trainavg)
##
## Prior probabilities of groups:
## 0 1
## 0.25 0.75
##
## Group means:
## MDVP.Fo.Hz. MDVP.Fhi.Hz. MDVP.Flo.Hz. MDVP.Jitter... MDVP.RAP MDVP.PPQ
## 0 166.4436 221.5269 142.05758 0.003315833 0.001477083 0.001695833
## 1 146.5777 204.6702 99.45608 0.006195972 0.003278056 0.003306905
## Jitter.DDP MDVP.Shimmer MDVP.Shimmer.dB. Shimmer.APQ3 Shimmer.APQ5
## 0 0.004432500 0.01602375 0.1435833 0.008297917 0.009444583
## 1 0.009835417 0.03158137 0.2931925 0.016695397 0.018639960
## MDVP.APQ Shimmer.DDA NHR HNR RPDE DFA spread1
## 0 0.01297917 0.02489500 0.005145833 26.33188 0.4559212 0.7122583 -6.873966
## 1 0.02601188 0.05008696 0.029137917 21.33347 0.5212461 0.7051521 -5.482128
## spread2 D2 PPE
## 0 0.1620567 2.062159 0.1155605
## 1 0.2425816 2.437170 0.2220227
##
## Coefficients of linear discriminants:
## LD1
## MDVP.Fo.Hz. 7.229350e-02
## MDVP.Fhi.Hz. -3.762147e-02
## MDVP.Flo.Hz. -1.148403e-01
## MDVP.Jitter... -1.365518e+03
## MDVP.RAP 9.213403e+00
## MDVP.PPQ 1.128217e+03
## Jitter.DDP 3.393728e+00
## MDVP.Shimmer 3.300002e+01
## MDVP.Shimmer.dB. 8.437520e+00
## Shimmer.APQ3 -3.084104e+02
## Shimmer.APQ5 1.326483e+03
## MDVP.APQ -7.479707e+02
## Shimmer.DDA -1.014674e+02
## NHR 9.625385e+01
## HNR 4.064078e-01
## RPDE -2.087497e+01
## DFA 2.252635e+01
## spread1 9.047795e+00
## spread2 1.562251e+01
## D2 -1.877527e+00
## PPE -3.901749e+01
plot(lda_fit) #produces plots of the linear discriminants
lda_pred = predict(lda_fit, pd2avg[testavg,])
names(lda_pred) #the names are class, contains LDA’s predictions
## [1] "class" "posterior" "x"
lda_class = lda_pred$class #the class predictions of our fitted model on test data
table(lda_class, y.testavg) #predictions vs real values of test data
## y.testavg
## lda_class 0 1
## 0 2 1
## 1 2 11
contrasts(pd2[,"status"]) #the one with value=1 is the one for which we are getting the posterior probability
## 1
## 0 0
## 1 1
"Accuracy"
## [1] "Accuracy"
mean(lda_class==y.testavg)
## [1] 0.8125
"Classification Error"
## [1] "Classification Error"
mean(lda_class!=y.testavg)
## [1] 0.1875
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100
sum(lda_pred$posterior[,1]>=0.5) #observations with predicted proba >0.5
## [1] 3
sum(lda_pred$posterior[,1]<0.5)
## [1] 13
#MY OWN POSTERIOR PROBABILITY THRESHOLD
#sum(lda_pred$posterior[,1]>0.9) #observations with predicted proba >0.9
#my_threshlold_classes=ifelse(lda_pred$posterior[,1]>0.9, "yes", "no") #yes being the one with contrast=1
#CV for LDA
k=10 #number of folds
Dataset=pd2avg
set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
## 1 2 3 4 5 6 7 8 9 10
## 3 6 4 1 2 2 5 3 1 5
error=vector()#error for each fold
for (j in 1:k) #j refers to folds
{
lda_fit = lda(status~., data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold
lda_pred = predict(lda_fit, Dataset[folds==j,]) #test data, current fold
lda_class = lda_pred$class #the class predictions of our fitted model on test data
error=append(error, mean(lda_class!=Dataset[folds==j, "status"])) #cv error
}
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
cv_error=mean(error)
cv_error
## [1] 0.2233333
Using Predictors Selected by Lasso
lda_fit = lda(status~spread1+spread2, data=pd2avg[,-5], subset=trainavg)
lda_fit
## Call:
## lda(status ~ spread1 + spread2, data = pd2avg[, -5], subset = trainavg)
##
## Prior probabilities of groups:
## 0 1
## 0.25 0.75
##
## Group means:
## spread1 spread2
## 0 -6.873966 0.1620567
## 1 -5.482128 0.2425816
##
## Coefficients of linear discriminants:
## LD1
## spread1 1.296697
## spread2 3.163055
plot(lda_fit) #produces plots of the linear discriminants
lda_pred = predict(lda_fit, pd2avg[testavg,])
names(lda_pred) #the names are class, contains LDA’s predictions
## [1] "class" "posterior" "x"
lda_class = lda_pred$class #the class predictions of our fitted model on test data
table(lda_class, y.testavg) #predictions vs real values of test data
## y.testavg
## lda_class 0 1
## 0 2 1
## 1 2 11
contrasts(pd2[,"status"]) #the one with value=1 is the one for which we are getting the posterior probability
## 1
## 0 0
## 1 1
"Accuracy"
## [1] "Accuracy"
mean(lda_class==y.testavg)
## [1] 0.8125
"Classification Error"
## [1] "Classification Error"
mean(lda_class!=y.testavg)
## [1] 0.1875
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100
sum(lda_pred$posterior[,1]>=0.5) #observations with predicted proba >0.5
## [1] 3
sum(lda_pred$posterior[,1]<0.5)
## [1] 13
#MY OWN POSTERIOR PROBABILITY THRESHOLD
#sum(lda_pred$posterior[,1]>0.9) #observations with predicted proba >0.9
#my_threshlold_classes=ifelse(lda_pred$posterior[,1]>0.9, "yes", "no") #yes being the one with contrast=1
#CV for LDA
k=10 #number of folds
Dataset=pd2avg
set.seed(4706)
folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
Dataset=cbind(Dataset, folds)
table(folds) #to see that they are balanced
## folds
## 1 2 3 4 5 6 7 8 9 10
## 3 6 4 1 2 2 5 3 1 5
error=vector()#error for each fold
for (j in 1:k) #j refers to folds
{
lda_fit = lda(status~spread1+spread2, data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold
lda_pred = predict(lda_fit, Dataset[folds==j,]) #test data, current fold
lda_class = lda_pred$class #the class predictions of our fitted model on test data
error=append(error, mean(lda_class!=Dataset[folds==j, "status"])) #cv error
}
cv_error=mean(error)
cv_error
## [1] 0.07833333
#qda_fit=qda(status~., data=pd2avg, subset = trainavg)
#qda_fit
#qda_class=predict(qda_fit, pd2avg[testavg,])$class
#table(qda_class, y.testavg)
#"Accuracy"
#mean(qda_class==y.testavg)
#"Classification Error"
#mean(qda_class!=y.testavg)
#FNR=(FN/total positives)x100
#FPR=(FP/total negatives)x100 = 1-specificity = Type I error
#overall error=(FP+FN)/total x100
#sensitivity = TPR = (TP/Total positives) x100
Using Predictors Selected by Lasso
#CV for QDA
#k=10 #number of folds
#Dataset=pd2
#set.seed(4706)
#folds=sample(1:k, nrow(Dataset), replace=TRUE)
#folds
#Dataset=cbind(Dataset, folds)
#table(folds) #to see that they are balanced
#error=vector()#error for each fold
#for (j in 1:k) #j refers to folds
#{
#qda_fit = qda(status~MDVP.Fo.Hz.+MDVP.Fhi.Hz.+MDVP.APQ+DFA+spread1+spread2+D2, data=Dataset[folds!=j,-5]) #this is the training data, those NOT in the current fold
#qda_class=predict(qda_fit, Dataset[folds==j,])$class #test data, current fold
#error=append(error, mean(qda_class!=Dataset[folds==j, "status"])) #cv error
#}
#cv_error=mean(error)
#cv_error
tree.pd=tree(pd2avg[,"status"]~., pd2avg[,1:22], subset=trainavg)
summary(tree.pd)
##
## Classification tree:
## tree(formula = pd2avg[, "status"] ~ ., data = pd2avg[, 1:22],
## subset = trainavg)
## Variables actually used in tree construction:
## [1] "MDVP.RAP"
## Number of terminal nodes: 2
## Residual mean deviance: 0.5456 = 7.638 / 14
## Misclassification error rate: 0.125 = 2 / 16
plot(tree.pd)
text(tree.pd, pretty=0)
Spread1 seems to be the most important predictor here.
tree.pred=predict(tree.pd, pd2avg[testavg,], type="class")
table(tree.pred, y.testavg)
## y.testavg
## tree.pred 0 1
## 0 2 2
## 1 2 10
"Accuracy"
## [1] "Accuracy"
mean(tree.pred==y.testavg)
## [1] 0.75
"Error"
## [1] "Error"
mean(tree.pred!=y.testavg)
## [1] 0.25
The test error rate seems good. We consider pruning the tree.
set.seed(4706)
cv.pd=cv.tree(tree.pd, FUN=prune.misclass)
names(cv.pd)
## [1] "size" "dev" "k" "method"
cv.pd
## $size
## [1] 2 1
##
## $dev
## [1] 6 7
##
## $k
## [1] -Inf 2
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
par(mfrow=c(1,2))
plot(cv.pd$size, cv.pd$dev, type="b", ylab="cv error")
plot(cv.pd$k, cv.pd$dev, type="b", ylab = "cv error")
Prune the tree and obtain the 2-node tree
pruned.pd=prune.misclass(tree.pd, best=2)
plot(pruned.pd)
text(pruned.pd, pretty=0)
pruned.pred=predict(pruned.pd, pd2avg[testavg,], type="class")
table(pruned.pred, y.testavg)
## y.testavg
## pruned.pred 0 1
## 0 2 2
## 1 2 10
"Accuracy"
## [1] "Accuracy"
mean(pruned.pred==y.testavg)
## [1] 0.75
"Error"
## [1] "Error"
mean(pruned.pred!=y.testavg)
## [1] 0.25
The unpruned tree actually performed better than the pruned tree.
BAGGING
#bagging
set.seed(4706)
bag.pd=randomForest(pd2avg[,"status"]~., pd2avg[,1:22], subset=trainavg, mtry=13, importance=TRUE)
#mtry: Number of variables randomly sampled as candidates at each split. Note that the default values are different for classification (sqrt(p) where p is number of variables in x) and regression (p/3)
bag.pd
##
## Call:
## randomForest(formula = pd2avg[, "status"] ~ ., data = pd2avg[, 1:22], mtry = 13, importance = TRUE, subset = trainavg)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 13
##
## OOB estimate of error rate: 18.75%
## Confusion matrix:
## 0 1 class.error
## 0 2 2 0.50000000
## 1 1 11 0.08333333
plot(bag.pd)
yhat.bag=predict(bag.pd, newdata = pd2avg[testavg,])
plot(yhat.bag, y.testavg, xlab="bagging pred", ylab="actual")
"Error"
## [1] "Error"
mean(yhat.bag!=y.testavg)
## [1] 0.1875
"Accuracy"
## [1] "Accuracy"
mean(yhat.bag==y.testavg)
## [1] 0.8125
RANDOM FOREST
set.seed(4706)
rf.pd=randomForest(pd2avg[,"status"]~., pd2avg[,1:22], subset=trainavg, mtry=6, importance=TRUE)
yhat.rf=predict(rf.pd, newdata = pd2avg[testavg,])
rf.pd
##
## Call:
## randomForest(formula = pd2avg[, "status"] ~ ., data = pd2avg[, 1:22], mtry = 6, importance = TRUE, subset = trainavg)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 18.75%
## Confusion matrix:
## 0 1 class.error
## 0 2 2 0.50000000
## 1 1 11 0.08333333
plot(rf.pd)
"Error"
## [1] "Error"
mean(yhat.rf!=y.testavg)
## [1] 0.1875
"Accuracy"
## [1] "Accuracy"
mean(yhat.rf==y.testavg)
## [1] 0.8125
importance(rf.pd)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## MDVP.Fo.Hz. 0.0000000 -1.7815347 -1.5904517 0.09742812
## MDVP.Fhi.Hz. 0.0000000 -1.9219142 -1.9590384 0.10147491
## MDVP.Flo.Hz. 0.9052750 -0.8170415 -0.1455245 0.06940220
## MDVP.Jitter... -0.4473031 -0.2018101 -0.3513199 0.06910079
## MDVP.Jitter.Abs. 1.5753235 1.4027521 1.6479090 0.08396429
## MDVP.RAP 4.3303869 3.5541055 4.2763050 0.63893095
## MDVP.PPQ 3.1495376 2.5798419 3.0042845 0.24788533
## Jitter.DDP 4.2965188 2.9975250 3.9475962 0.74593191
## MDVP.Shimmer 1.1353541 -0.8093464 -0.2458390 0.12854286
## MDVP.Shimmer.dB. 0.4927843 1.3850664 1.2311200 0.28830539
## Shimmer.APQ3 1.3440623 1.6477889 2.1561750 0.08777047
## Shimmer.APQ5 0.8256994 1.7337508 1.8069892 0.19978665
## MDVP.APQ 1.2437604 -2.0813379 -1.3907097 0.28018105
## Shimmer.DDA -1.0010015 -0.5346753 -0.7146504 0.06769658
## NHR 3.4916476 2.0107387 2.7937183 0.71689267
## HNR -1.0010015 -0.3358583 -0.7477619 0.10969811
## RPDE -1.7372705 -1.0010015 -1.6925888 0.03998810
## DFA -1.7026641 -1.2703560 -1.6412227 0.05978581
## spread1 3.4975234 3.2373497 3.7180401 0.40137408
## spread2 1.4170505 -1.1150545 -0.3283415 0.18264000
## D2 3.2717921 1.6619264 2.7608042 0.62698535
## PPE 4.3453977 4.0778719 4.4180393 0.47423438
#ibrary(gbm)
#gbm() with the option distribution="gaussian" since this is a regression problem; if it were a bi- nary classification problem, we would use distribution="bernoulli". The argument n.trees=5000 indicates that we want 5000 trees, and the option interaction.depth=4 limits the depth of each tree
#set.seed(4706)
#boost.pd=gbm(pd2avg[trainavg,"status"]~., pd2avg[trainavg,1:22], distribution = "gaussian", n.trees=5000, interaction.depth=4)
#shrinkage=0.001 by default
#can change it
#summary(boost.pd)
#par(mfrow=c(1,2))
#plot(boost.pd, i="spread1")
#plot(boost.pd, i="MDVP.Fo.Hz.")
P-values are all very large
contrasts(pd2avg[,"status"])
## 1
## 0 0
## 1 1
glm_probsavg = predict(logist_reg_fit, pd2avg[testavg,], type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
head(glm_probsavg)
## phon_R01_S35 phon_R01_S16 phon_R01_S18 phon_R01_S08 phon_R01_S02 phon_R01_S01
## 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
glm_predavg=ifelse(glm_probsavg>0.5, 1, 0)
head(glm_predavg)
## phon_R01_S35 phon_R01_S16 phon_R01_S18 phon_R01_S08 phon_R01_S02 phon_R01_S01
## 1 0 1 1 0 0
length(glm_predavg)
## [1] 16
length(y.testavg)
## [1] 16
table(glm_predavg, y.testavg)
## y.testavg
## glm_predavg 0 1
## 0 2 6
## 1 2 6
Classification Error
mean(glm_predavg!=y.testavg)
## [1] 0.5
Accuracy
mean(glm_predavg==y.testavg)
## [1] 0.5
Logistic Regression 0.1752577 LDA (full) 0.1172264 QDA (full) 0.1384934 Tree (full) 0.1443299 Tree (pruned) 0.1649485 Bagging 0.07216495 Random Forest 0.08247423